# figures-tag-report.py (6.81 KB)
from optparse import OptionParser
import re
from collections import defaultdict as df
import os
import random
from pandas import DataFrame as DF
import matplotlib.pyplot as plt

# Objective
# Draw figures from grid reports
#
# Input parameters
# --inputPath=PATH              Path of inputfiles
# --outputPath=PATH             Path to place output figures
# --figureName            single run specific name figure, multifigure first part of name
# --inputFile             Use it for a single report
# --version               CRF-script version of reports
#
# Output
# Figure file plotting the per-tag precision of each report (run)
#
# Examples
# python figures-reports.py
# --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/reports/
# --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/figures/
# --figureName FiguresGrid
# --inputFile report_Run1_v11.txt
# --version v11

# python figures-tag-report.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/reports/ --outputPath /home/egaytan/automatic-extraction-growth-conditions/CRF/figures/ --figureName FiguresGrid_v11 --version v11
__author__ = 'egaytan'

####################################################################################
#                                   FUNCTIONS                                      #
####################################################################################
def Filter(rfile, options, v):
    """Decide whether a file name is a report that should be processed.

    Args:
        rfile: File name (without directory) to test.
        options: List of requested report names. When its first element is
            ``'all'``, every file whose name starts with ``report`` and
            carries version ``v`` is selected; otherwise only file names
            listed explicitly in ``options`` are selected.
        v: Version tag expected immediately before the 4-character
            extension, e.g. ``'v11'`` for ``report_Run1_v11.txt``.

    Returns:
        True when the file should be processed, False otherwise.
    """
    # An empty selection matches nothing (avoids IndexError on options[0]).
    if not options:
        return False
    if options[0] == 'all':
        # Drop the 4-character extension (such as '.txt') and compare the
        # tail with the version tag, whatever its length; a fixed 3-character
        # slice would silently reject versions such as 'v9' or 'v110'.
        stem = rfile[:-4]
        return bool(v) and rfile.startswith('report') and stem.endswith(v)
    return rfile in options

####################################################################################
#                                   MAIN PROGRAM                                  #
####################################################################################

# Tags whose scores are read from every report, in plotting order.
TAG_NAMES = ['OD', 'pH', 'Technique', 'Med', 'Temp', 'Vess', 'Agit', 'Phase',
             'Air', 'Anti', 'Strain', 'Gtype', 'Substrain', 'Supp', 'Gversion']

# One matplotlib format string per tag; paired with TAG_NAMES by position.
PLOT_STYLES = ['r--', 'rs', 'r^', 'r:', 'rH', 'g--', 'gs', 'g^', 'g|', 'gH',
               'b--', 'bs', 'b^', 'b|', 'bH', 'r+']


def natural_sort_key(name):
    """Return a sort key that orders embedded numbers numerically.

    With this key ``report_Run2_v11.txt`` sorts before
    ``report_Run10_v11.txt``, which plain string ordering would not do.
    """
    # re.split with a capturing group alternates text and digit runs, so
    # positions compared between two keys always hold the same type.
    return [int(tok) if tok.isdigit() else tok for tok in re.split(r'(\d+)', name)]


def extract_tag_scores(text, tag_names):
    """Extract the first three decimal scores that follow each tag name.

    Args:
        text: Full content of a report file.
        tag_names: Iterable of tag labels to look for.

    Returns:
        Dict mapping every tag in ``tag_names`` to a tuple of three floats
        (stored by the caller as precision, recall and f-score), taken from
        the first occurrence of ``<tag> <num> <num> <num>`` in ``text``.
        A tag with no such occurrence maps to ``None``.
    """
    found = {}
    for tag in tag_names:
        hit = re.search(re.escape(tag) + r'\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)', text)
        found[tag] = tuple(float(num) for num in hit.groups()) if hit else None
    return found


if __name__ == '__main__':
    # Defining parameters
    parser = OptionParser()
    parser.add_option('--inputPath',  dest='inputPath',   help='Path of output from CoreNLP',           metavar='PATH')
    parser.add_option('--outputPath', dest='outputPath',  help='Path to place output figures',          metavar='PATH')
    parser.add_option('--figureName', dest='figureName',  help='Specific or first part of figurename',  metavar='FILE')
    parser.add_option('--version', dest='version',  help='script version',  metavar='FILE')
    parser.add_option('--inputFile',  dest='inputFile',   help='Use it for a specific report files',            metavar='FILE', default='all,')

    (options, args) = parser.parse_args()
    if len(args) > 0:
        # parser.error() prints the message and terminates the program itself.
        parser.error('Any parameter given.\nFor multi input files be sure to seprate the filenames by coma')
    if not options.inputPath:
        parser.error('--inputPath is required')

    print('-------------------------------- PARAMETERS --------------------------------')
    print('Path of output from CoreNLP: ' + str(options.inputPath))
    print('Path to place output figures: ' + str(options.outputPath))
    print('Specific or first part of figurename: ' + str(options.figureName))
    print('CRF-script version: ' + str(options.version))

    print('-------------------------------- PROCESSING --------------------------------')

    rawInputRepotsList = str(options.inputFile).split(',')
    # os.listdir() returns names in arbitrary order; sort them so that the
    # position of each point on the x axis is deterministic and follows the
    # number embedded in the file name.
    reportFileList = sorted(
        (rfile for rfile in os.listdir(options.inputPath)
         if Filter(rfile, rawInputRepotsList, str(options.version))),
        key=natural_sort_key)
    print('Report files: ' + str(options.inputFile  ))
    print('\n'.join(reportFileList))
    print('----------------------------------- NOTE -----------------------------------')
    print('\n-------- All chosen report files should be in inputPath given---------------\n')
    if not reportFileList:
        parser.error('No report files matching the given options were found in ' + str(options.inputPath))

    print('------------------------------- SAVING DATA --------------------------------\n')

    # Per-tag score series; element i of every list belongs to report i.
    precision = df(list)
    recall = df(list)
    fscore = df(list)
    missing = (float('nan'), float('nan'), float('nan'))
    for report in reportFileList:
        with open(os.path.join(options.inputPath, report), 'r') as File:
            tagScores = extract_tag_scores(File.read(), TAG_NAMES)
        for tag in TAG_NAMES:
            triple = tagScores[tag]
            if triple is None:
                # Keep every series the same length so runs stay aligned;
                # NaN shows up as a gap in the plot.
                print('WARNING: tag ' + tag + ' not found in ' + report)
                triple = missing
            precision[tag].append(triple[0])
            recall[tag].append(triple[1])
            fscore[tag].append(triple[2])
    print(DF(precision))
    print(precision)

    imageName = str(options.figureName) + '_' +  str(options.version)
    fig = plt.figure()
    plt.rcParams.update({'font.size': 15})
    fig.set_figheight(13)
    fig.set_figwidth(20)
    plt.xlabel("Runs")
    plt.ylabel("score")
    plt.ylim(-0.2, 1.2)
    plt.grid(False)
    for tag, style in zip(TAG_NAMES, PLOT_STYLES):
        plt.plot(precision[tag], style, label=tag, linewidth=8)
    plt.legend(loc='lower right')
    plt.tight_layout()
    # One tick per processed report; labels number the reports in sorted order.
    runCount = len(reportFileList)
    plt.xticks(range(runCount), ['run' + str(idx + 1) for idx in range(runCount)])
    # Honour --outputPath; fall back to the current directory when it is not given.
    outputFile = os.path.join(options.outputPath, imageName) if options.outputPath else imageName
    fig.savefig(outputFile, bbox_inches='tight', pad_inches = 0.5)