label-split_training_test_v1.py 5.42 KB

Raw Blame History Permalink

#!/bin/python3
import os
from itertools import chain
from optparse import OptionParser
from time import time
from collections import Counter
import re

import nltk
import sklearn
import scipy.stats
import sys

from sklearn.externals import joblib
from sklearn.metrics import make_scorer
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

from nltk.corpus import stopwords
import random


# Objective
# Label each token (fields joined by '|') and split the CoreNLP-tagged sentences 70/30 into training and test files
#
# Input parameters
# --inputPath=PATH          Path of the input file (CoreNLP output)
# --inputFile=FILE          File with CoreNLP-tagged sentences
# --outputPath=PATH         Path to place output files
# --trainingFile=FILE       Output training data set
# --testFile=FILE           Output test data set
#
# Output
# training and test data set
#
# Examples
# python label-split_training_test_v1.py
# --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/
# --inputFile sentences.tsv_pakal_.conll
# --trainingFile training-data-set-70.txt
# --testFile test-data-set-30.txt
# --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets
#
#
# python label-split_training_test_v1.py --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/ --inputFile sentences.tsv_pakal_.conll --trainingFile training-data-set-70.txt --testFile test-data-set-30.txt --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets


##########################################
#               MAIN PROGRAM             #
##########################################

# Value of the second column that marks the end of a sentence in the input.
SENTENCE_END = "PGCGROWTHCONDITIONS"

# Label given to tokens that are outside every annotated span.
OTHER_LABEL = 'O'

# Opening tags: once seen, the mapped label is applied to the tokens that
# follow until another tag changes it.
IN_LABELS = {
    '<Gtype>': 'Gtype',
    '<Gversion>': 'Gversion',
    '<Med>': 'Med',
    '<Phase>': 'Phase',
    '<Sample>': 'Sample',
    '<Serie>': 'Serie',
    '<Substrain>': 'Substrain',
    '<Supp>': 'Supp',
    '<Technique>': 'Technique',
    '<Temp>': 'Temp',
    '<OD>': 'OD',
    '<Anti>': 'Anti',
    '<Agit>': 'Agit',
    '<Vess>': 'Vess',
}

# Closing tags, plus the opening tags <Strain>, <Name> and <Orgn>: all of them
# reset the running label to 'O'.
OUT_LABELS = {
    '</Air>': 'O',
    '</Gtype>': 'O',
    '</Gversion>': 'O',
    '</Med>': 'O',
    '</Phase>': 'O',
    '</Sample>': 'O',
    '</Serie>': 'O',
    '</Strain>': 'O',
    '<Strain>': 'O',
    '</Substrain>': 'O',
    '</Supp>': 'O',
    '</Technique>': 'O',
    '</Temp>': 'O',
    '</OD>': 'O',
    '</Anti>': 'O',
    '</Agit>': 'O',
    '<Name>': 'O',
    '</Name>': 'O',
    '<Orgn>': 'O',
    '</Orgn>': 'O',
    '</Vess>': 'O',
}


def tag_sentences(lines):
    """Build labelled sentences from tab-separated token lines.

    Each line is split on tabs; lines with fewer than two fields are ignored.
    The second field decides what happens:

    * a key of ``OUT_LABELS`` or ``IN_LABELS`` changes the running label and
      emits nothing;
    * ``SENTENCE_END`` closes the current sentence;
    * anything else is emitted as ``' f1|f2|f3|LABEL '`` where ``f1..f3`` are
      fields 2-4 of the line and ``LABEL`` is the running label.

    The running label is NOT reset at a sentence boundary (same as before).

    :param lines: iterable of text lines (an open file works).
    :return: list of sentence strings, in input order.
    """
    flag = OTHER_LABEL
    sentences = []
    tokens = []
    for line in lines:
        # Drop the line terminator so it cannot leak into the last field.
        fields = line.rstrip('\r\n').split('\t')
        if len(fields) < 2:
            continue
        word = fields[1]
        if word in OUT_LABELS:
            flag = OUT_LABELS[word]
        elif word in IN_LABELS:
            flag = IN_LABELS[word]
        elif word == SENTENCE_END:
            sentences.append(''.join(tokens))
            tokens = []
        else:
            # Leading and trailing blank are kept so the output format is
            # unchanged (tokens end up separated by two spaces).
            tokens.append(' ' + '|'.join(fields[1:4]) + '|' + flag + ' ')
    if tokens:
        # Input ended without a closing marker: keep the pending sentence
        # instead of silently dropping it.
        sentences.append(''.join(tokens))
    return sentences


def split_indices(total, training_fraction=0.70, rng=random):
    """Randomly partition ``range(total)`` into training and test indices.

    :param total: number of items to split.
    :param training_fraction: share of items for training; the training size
        is ``int(total * training_fraction)``.
    :param rng: object providing ``sample`` (defaults to the ``random`` module).
    :return: ``(training, test)`` lists; ``training`` is in sampled order,
        ``test`` is ascending. Together they cover every index exactly once.
    """
    training = rng.sample(range(total), int(total * training_fraction))
    # Set membership keeps the complement computation linear.
    chosen = set(training)
    test = [i for i in range(total) if i not in chosen]
    return training, test


def write_sentences(path, sentences):
    """Write ``sentences`` to ``path``, one per line, without a final newline."""
    with open(path, "w") as out_file:
        out_file.write('\n'.join(sentences))


def main(argv=None):
    """Parse the command line, tag the input sentences and write the split.

    :param argv: argument list without the program name; ``None`` makes
        ``OptionParser`` read ``sys.argv[1:]``.
    """
    # Defining parameters
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path of output from CoreNLP", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path to place output files",
                      metavar="PATH")
    parser.add_option("--inputFile", dest="inputFile",
                      help="File with CoreNLP-tagging sentences", metavar="FILE")
    parser.add_option("--trainingFile", dest="trainingFile",
                      help="File with training data set", metavar="FILE")
    parser.add_option("--testFile", dest="testFile",
                      help="File with test data set", metavar="FILE")

    (options, args) = parser.parse_args(argv)
    # parser.error() prints the usage message and exits by itself.
    if args:
        parser.error("Unexpected positional arguments: " + ' '.join(args))
    required = ("inputPath", "inputFile", "outputPath", "trainingFile", "testFile")
    missing = [name for name in required if getattr(options, name) is None]
    if missing:
        parser.error("Missing required options: "
                     + ', '.join('--' + name for name in missing))

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path of CoreNLP output: " + options.inputPath)
    print("File with CoreNLP-tagging sentences: " + str(options.inputFile))
    print("Path of training data set: " + str(options.outputPath))
    print("File with training data set: " + str(options.trainingFile))
    print("Path of test data set: " + str(options.outputPath))
    print("File with test data set: " + str(options.testFile))
    print('-------------------------------- PROCESSING --------------------------------')

    with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
        sentences = tag_sentences(input_file)

    print("Number of sentences: " + str(len(sentences)))

    # Split 70 30 training and test sentences; every sentence goes to exactly
    # one of the two sets.
    training_index, test_index = split_indices(len(sentences))
    print(len(training_index))
    print(len(test_index))

    write_sentences(os.path.join(options.outputPath, options.trainingFile),
                    [sentences[i] for i in training_index])
    write_sentences(os.path.join(options.outputPath, options.testFile),
                    [sentences[i] for i in test_index])

    print("==================================END===================================")


if __name__ == "__main__":
    main()