Author: Estefani Gaytan Nunez

Scripts

#!/bin/python3
# Author: Estefani Gaytan Nunez
#
# Objective
#   Label tokens as 'word|POS|lemma|TAG' and split the labeled sentences
#   70/30 into training and test files from CoreNLP-tagging output.
#   Only sentences containing at least one true tag are kept.
#
# Input parameters
#   --inputPath=PATH      Path of the input file
#   --inputFile=FILE      File with CoreNLP-tagged sentences (CoNLL format)
#   --outputPath=PATH     Path to place the output files
#   --trainingFile=FILE   Output training data set
#   --testFile=FILE       Output test data set
#
# Output
#   Training and test data sets.
#
# Example
#   python label-split_training_test_v2.py \
#       --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/ \
#       --inputFile sentences.tsv_pakal_.conll \
#       --trainingFile training-data-set-70.txt \
#       --testFile test-data-set-30.txt \
#       --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets

from optparse import OptionParser
import re
import os
import random

# Opening tags that start a "true" entity span; the mapped value is the tag
# appended to every token until a closing/reset tag is seen.
IN_LABELS = {
    '<Gtype>': 'Gtype',
    '<Gversion>': 'Gversion',
    '<Med>': 'Med',
    '<Phase>': 'Phase',
    '<Supp>': 'Supp',
    '<Technique>': 'Technique',
    '<Temp>': 'Temp',
    '<OD>': 'OD',
    '<Anti>': 'Anti',
}

# Tags that reset the current label back to 'O' (outside any entity).
OUT_LABELS = {
    '<Air>': 'O', '</Air>': 'O',
    '</Gtype>': 'O',
    '</Gversion>': 'O',
    '</Med>': 'O',
    '</Phase>': 'O',
    '<Sample>': 'O', '</Sample>': 'O',
    '<Serie>': 'O', '</Serie>': 'O',
    '<Strain>': 'O', '</Strain>': 'O',
    '<Substrain>': 'O', '</Substrain>': 'O',
    '</Supp>': 'O',
    '</Technique>': 'O',
    '</Temp>': 'O',
    '</OD>': 'O',
    '<Agit>': 'O', '</Agit>': 'O',
    '<Name>': 'O', '</Name>': 'O',
    '<Orgn>': 'O', '</Orgn>': 'O',
    '</Anti>': 'O',
    '<Vess>': 'O', '</Vess>': 'O',
}


def extract_sentences(lines, in_labels, out_labels):
    """Build 'word|POS|lemma|TAG' sentences from CoreNLP CoNLL lines.

    lines      -- iterable of tab-separated CoNLL lines; the token is field 1
    in_labels  -- dict mapping opening tags to the entity tag they start
    out_labels -- dict mapping closing/reset tags to 'O'
    Returns the list of sentences that contain at least one true tag.
    """
    flag = 'O'        # current label; 'O' means outside any entity
    sentences = []
    sentence = ''     # tokens accumulated for the sentence in progress
    for line in lines:
        fields = line.split('\t')
        if len(fields) <= 1:
            continue
        w = fields[1]
        if w in in_labels:
            flag = in_labels[w]
        elif w in out_labels:
            flag = out_labels[w]
        elif w == "PGCGROWTHCONDITIONS":
            # End-of-sentence marker: keep the sentence only if at least one
            # token carries a true (non-'O') tag.
            tokens = sentence.split(' ')
            true_tags = [t for t in tokens
                         if t.split('|')[-1] in in_labels.values()]
            if len(true_tags) > 0:
                sentences.append(sentence)
            # Always start a fresh sentence. (The original reset the buffer
            # only for kept sentences, leaking a skipped sentence's tokens
            # into the next one.)
            sentence = ''
        else:
            # Append 'word|POS|lemma|TAG', preserving the original spacing.
            sentence = sentence + ' ' + ('|'.join(fields[1:4]) + '|' + flag + ' ')
    return sentences


def split_indices(total, ratio=0.70):
    """Randomly split range(total) into (training, test) index lists.

    total -- number of sentences; ratio -- training fraction (default 0.70).
    Every index appears in exactly one of the two lists. (The original
    sampled from range(total - 1), silently dropping the last sentence.)
    """
    training = random.sample(range(total), int(total * ratio))
    chosen = set(training)  # O(1) membership instead of O(n) list scans
    test = [i for i in range(total) if i not in chosen]
    return training, test


if __name__ == "__main__":
    # Defining parameters
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path of output from CoreNLP", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path to place output files",
                      metavar="PATH")
    parser.add_option("--inputFile", dest="inputFile",
                      help="File with CoreNLP-tagging sentences", metavar="FILE")
    parser.add_option("--trainingFile", dest="trainingFile",
                      help="File with training data set", metavar="FILE")
    parser.add_option("--testFile", dest="testFile",
                      help="File with test data set", metavar="FILE")
    (options, args) = parser.parse_args()
    if len(args) > 0:
        # parser.error() prints the message and exits with non-zero status,
        # so no explicit sys.exit() is needed (the original called sys.exit
        # without importing sys, which would have raised NameError).
        parser.error("No positional arguments are allowed.")

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path of CoreNLP output: " + options.inputPath)
    print("File with CoreNLP-tagging sentences: " + str(options.inputFile))
    print("Path of training data set: " + options.outputPath)
    print("File with training data set: " + str(options.trainingFile))
    print("Path of test data set: " + options.outputPath)
    print("File with test data set: " + str(options.testFile))
    print('-------------------------------- PROCESSING --------------------------------')

    with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
        lista = extract_sentences(input_file, IN_LABELS, OUT_LABELS)
    print("Number of sentences: " + str(len(lista)))

    # Split 70/30 into training and test sentences.
    trainingIndex, testIndex = split_indices(len(lista), 0.70)
    with open(os.path.join(options.outputPath, options.trainingFile), "w") as oFile:
        oFile.write('\n'.join(lista[i] for i in trainingIndex))
    with open(os.path.join(options.outputPath, options.testFile), "w") as oFile:
        oFile.write('\n'.join(lista[i] for i in testIndex))
    print("==================================END===================================")
#!/bin/python3
# Author: Estefani Gaytan Nunez
#
# Objective (v2: Agit and Vess are promoted to true entity tags)
#   Label tokens as 'word|POS|lemma|TAG' and split the labeled sentences
#   70/30 into training and test files from CoreNLP-tagging output.
#   Only sentences containing at least one true tag are kept.
#
# Input parameters
#   --inputPath=PATH      Path of the input file
#   --inputFile=FILE      File with CoreNLP-tagged sentences (CoNLL format)
#   --outputPath=PATH     Path to place the output files
#   --trainingFile=FILE   Output training data set
#   --testFile=FILE       Output test data set
#
# Output
#   Training and test data sets.
#
# Example
#   python label-split_training_test_v2.py \
#       --inputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/output/ \
#       --inputFile raw-metadata-senteneces.txt.conll \
#       --trainingFile training-data-set-70_v2.txt \
#       --testFile test-data-set-30_v2.txt \
#       --outputPath /home/egaytan/GROWTH-CONDITIONS-GEO-EXTRACTION/CRF/data-sets

from optparse import OptionParser
import re
import os
import random

# Opening tags that start a "true" entity span; the mapped value is the tag
# appended to every token until a closing/reset tag is seen.
IN_LABELS = {
    '<Gtype>': 'Gtype',
    '<Gversion>': 'Gversion',
    '<Med>': 'Med',
    '<Phase>': 'Phase',
    '<Supp>': 'Supp',
    '<Technique>': 'Technique',
    '<Temp>': 'Temp',
    '<OD>': 'OD',
    '<Anti>': 'Anti',
    '<Agit>': 'Agit',
    '<Vess>': 'Vess',
}

# Tags that reset the current label back to 'O' (outside any entity).
OUT_LABELS = {
    '<Air>': 'O', '</Air>': 'O',
    '</Gtype>': 'O',
    '</Gversion>': 'O',
    '</Med>': 'O',
    '</Phase>': 'O',
    '<Sample>': 'O', '</Sample>': 'O',
    '<Serie>': 'O', '</Serie>': 'O',
    '<Strain>': 'O', '</Strain>': 'O',
    '<Substrain>': 'O', '</Substrain>': 'O',
    '</Supp>': 'O',
    '</Technique>': 'O',
    '</Temp>': 'O',
    '</OD>': 'O',
    '</Anti>': 'O',
    '</Agit>': 'O',
    '<Name>': 'O', '</Name>': 'O',
    '<Orgn>': 'O', '</Orgn>': 'O',
    '</Vess>': 'O',
}


def extract_sentences(lines, in_labels, out_labels):
    """Build 'word|POS|lemma|TAG' sentences from CoreNLP CoNLL lines.

    lines      -- iterable of tab-separated CoNLL lines; the token is field 1
    in_labels  -- dict mapping opening tags to the entity tag they start
    out_labels -- dict mapping closing/reset tags to 'O'
    Returns the list of sentences that contain at least one true tag.
    """
    flag = 'O'        # current label; 'O' means outside any entity
    sentences = []
    sentence = ''     # tokens accumulated for the sentence in progress
    for line in lines:
        fields = line.split('\t')
        if len(fields) <= 1:
            continue
        w = fields[1]
        if w in in_labels:
            flag = in_labels[w]
        elif w in out_labels:
            flag = out_labels[w]
        elif w == "PGCGROWTHCONDITIONS":
            # End-of-sentence marker: keep the sentence only if at least one
            # token carries a true (non-'O') tag. The original filtered on
            # an undefined name `word` (NameError) — it must be the loop
            # variable of the comprehension.
            tokens = sentence.split(' ')
            true_tags = [t for t in tokens
                         if t.split('|')[-1] in in_labels.values()]
            if len(true_tags) > 0:
                sentences.append(sentence)
            # Always start a fresh sentence. (The original reset the buffer
            # only for kept sentences, leaking a skipped sentence's tokens
            # into the next one.)
            sentence = ''
        else:
            # Building the tagged sentence: 'word|POS|lemma|TAG',
            # preserving the original spacing.
            sentence = sentence + ' ' + ('|'.join(fields[1:4]) + '|' + flag + ' ')
    return sentences


def split_indices(total, ratio=0.70):
    """Randomly split range(total) into (training, test) index lists.

    total -- number of sentences; ratio -- training fraction (default 0.70).
    Every index appears in exactly one of the two lists. (The original
    sampled from range(total - 1), silently dropping the last sentence.)
    """
    training = random.sample(range(total), int(total * ratio))
    chosen = set(training)  # O(1) membership instead of O(n) list scans
    test = [i for i in range(total) if i not in chosen]
    return training, test


if __name__ == "__main__":
    # Defining parameters
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path of output from CoreNLP", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path to place output files",
                      metavar="PATH")
    parser.add_option("--inputFile", dest="inputFile",
                      help="File with CoreNLP-tagging sentences", metavar="FILE")
    parser.add_option("--trainingFile", dest="trainingFile",
                      help="File with training data set", metavar="FILE")
    parser.add_option("--testFile", dest="testFile",
                      help="File with test data set", metavar="FILE")
    (options, args) = parser.parse_args()
    if len(args) > 0:
        # parser.error() prints the message and exits with non-zero status,
        # so no explicit sys.exit() is needed (the original called sys.exit
        # without importing sys, which would have raised NameError).
        parser.error("No positional arguments are allowed.")

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path of CoreNLP output: " + options.inputPath)
    print("File with CoreNLP-tagging sentences: " + str(options.inputFile))
    print("Path of training data set: " + options.outputPath)
    print("File with training data set: " + str(options.trainingFile))
    print("Path of test data set: " + options.outputPath)
    print("File with test data set: " + str(options.testFile))
    print('-------------------------------- PROCESSING --------------------------------')

    with open(os.path.join(options.inputPath, options.inputFile), "r") as input_file:
        lista = extract_sentences(input_file, IN_LABELS, OUT_LABELS)
    # The original printed str(n) + str(len(lista)+1), concatenating two
    # counters into one unreadable number; report the kept count directly.
    print("Number of sentences: " + str(len(lista)))

    # Split 70/30 into training and test sentences.
    trainingIndex, testIndex = split_indices(len(lista), 0.70)
    with open(os.path.join(options.outputPath, options.trainingFile), "w") as oFile:
        oFile.write('\n'.join(lista[i] for i in trainingIndex))
    with open(os.path.join(options.outputPath, options.testFile), "w") as oFile:
        oFile.write('\n'.join(lista[i] for i in testIndex))
    print("==================================END===================================")
......@@ -299,7 +299,7 @@ if __name__ == "__main__":
# Original: labels = list(crf.classes_)
# Original: labels.remove('O')
labels = list(['Air', 'Gtype', 'Gversion', 'Med', 'Phase', 'Supp', 'Technique', 'Temp', 'OD', 'Anti', 'Agit', 'Vess'])
labels = list(['Gtype', 'Gversion', 'Med', 'Phase', 'Supp', 'Technique', 'Temp', 'OD', 'Anti'])
# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
......
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.