# Author: cmendezc
# New processing for retraining (original note: "Nuevo procesamiento para reentrenamiento")

import stanza
import argparse
import re
import os
import pandas as pd
# Objective
# Check if MCO terms appear in raw sentences from extracted sentences from softfiles
#
# Input parameters
# --inputPath=PATH Path to geo_sentences_to_check_fixed.csv
# /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
# --inputFile=PATH File geo_sentences_to_check_fixed.csv
# --inputPathMco Path to MCO term file
# /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/mco_regulondb
# --inputFileMco File with MCO terms GC_Terms.txt (tsv)
# --outputPath=PATH Path to place MCO terms that appeared in input file
#
# Output
# Files with MCO terms that appeared in input file
#
# _v1
# python check_mco_terms_in_sentences_v1.py
# --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
# --inputFile geo_sentences_to_check_fixed.csv
# --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
# --inputPathMco /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/mco_regulondb
# --inputFileMco GC_Terms.txt
# python check_mco_terms_in_sentences_v1.py --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences --inputFile geo_sentences_to_check_fixed.csv --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences --inputPathMco /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/mco_regulondb --inputFileMco GC_Terms.txt
'''
RESULTADO DE LA REVISIÓN:
Sólo se encontraron los siguientes términos en las oraciones extraídas de la curación de los softfiles:
TERM_NAME
L broth (también fue anotado por el curador)
MOPS (anotado por el curador como <Med> MOPS minimal glucose media </Med>)
glucose (también fue anotado por el curador, pero no como palabra aislada)
nitrate (también fue anotado por el curador, aislado como <Supp> nitrate </Supp> y también como parte de varios suplementos anotados por el curador)
M9 minimal medium (también fue anotado por el curador)
OD600 of 0.3 (también fue anotado por el curador)
Escherichia coli (no estamos considerando organismos)
LB medium (no anotado por el curador)
'''
##########################################
# MAIN PROGRAM #
##########################################
if __name__ == "__main__":
    # Defining parameters
    parser = argparse.ArgumentParser(
        prog='check_mco_terms_in_sentences_v1.py',
        description='Check if MCO terms appear in raw sentences from extracted sentences from softfiles.',
        epilog='')
    parser.add_argument("--inputPath", dest="inputPath",
                        help="Path to extracted sentences from softfiles", metavar="PATH")
    parser.add_argument("--inputFile", dest="inputFile",
                        help="Input extracted sentences from softfiles", metavar="FILE")
    parser.add_argument("--outputPath", dest="outputPath",
                        help="Path to place MCO terms that appeared in input file", metavar="PATH")
    parser.add_argument("--inputPathMco", dest="inputPathMco",
                        help="Path to MCO file", metavar="PATH")
    parser.add_argument("--inputFileMco", dest="inputFileMco",
                        help="MCO file", metavar="FILE")
    args = parser.parse_args()
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to extracted sentences from softfiles: " + args.inputPath)
    print("Input extracted sentences from softfiles: " + args.inputFile)
    print("Path to place MCO terms that appeared in input file: " + args.outputPath)
    print("Path to MCO file: " + args.inputPathMco)
    print("MCO file: " + args.inputFileMco)
    print('-------------------------------- PROCESSING --------------------------------')
    df_sentences_to_check = pd.read_csv(os.path.join(args.inputPath, args.inputFile))
    print(df_sentences_to_check.head(3))
    print(df_sentences_to_check.shape)
    df_mco_terms = pd.read_csv(os.path.join(args.inputPathMco, args.inputFileMco), sep="\t")
    print(df_mco_terms.head(3))
    print(df_mco_terms.shape)
    # Rebuild the raw text of each curated sentence: every token in
    # transformed_sentence is "word|lemma|xpos|tag", so the surface word
    # is the first "|"-separated field of each token.
    text_sentences = []
    for ind in df_sentences_to_check.index:
        line_trans = df_sentences_to_check['transformed_sentence'][ind]
        words = [token.split("|")[0] for token in line_trans.split()]
        text_sentence = " ".join(words)
        # Keep unique sentences only
        if text_sentence not in text_sentences:
            text_sentences.append(text_sentence)
    # Tokenize each MCO term with stanza so its token spacing matches the
    # rebuilt (tokenized) sentences before doing the substring search.
    nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt')
    # Collect matches in a plain list and build the DataFrame once at the end:
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
    # appending row-by-row is quadratic anyway.
    found_rows = []
    for ind in df_mco_terms.index:
        term_type = df_mco_terms['TERM_TYPE'][ind]
        term_name = df_mco_terms['TERM_NAME'][ind]
        doc = nlp(term_name)
        term_name_new = " ".join(w.text for w in doc.sentences[0].words)
        for s in (sent for sent in text_sentences if term_name_new in sent):
            print("TERM_TYPE {} TERM_NAME {} SENT {}".format(term_type, term_name, s))
            found_rows.append({'TERM_TYPE': term_type,
                               'TERM_NAME': term_name,
                               'SENTENCE': s})
    df_mco_terms_found = pd.DataFrame(found_rows, columns=['TERM_TYPE', 'TERM_NAME', 'SENTENCE'])
    df_mco_terms_found.to_csv(os.path.join(args.outputPath, 'MCO_terms_found_in_softfiles.tsv'), sep="\t")
# (diff artifact) @@ -15,11 +15,11 @@ import pandas as pd
# Files with sentences obtained from XML Soft files
#
# Examples
# python extract-sentences-from-softfiles.py
# python extract-sentences-from-softfiles_v2.py
# --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data
# --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
#
# python extract-sentences-from-softfiles.py --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
# python extract-sentences-from-softfiles_v2.py --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
##########################################
# MAIN PROGRAM #
# (diff artifact: hunk truncated here)
import stanza
import argparse
import re
import os
import pandas as pd
# Objective
# Sentences extraction from XML Soft files. _v3 includes dictionary-based NER of MCO conditions
#
# Input parameters
# --inputPath=PATH Path to XML Soft files
# --outputPath=PATH Path to place output files
#
# Output
# Files with sentences obtained from XML Soft files
#
# Examples
# python extract-sentences-from-softfiles_v2.py
# --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data
# --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
#
# python extract-sentences-from-softfiles_v2.py --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
# _v3
# python extract-sentences-from-softfiles_v3.py
# --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data
# --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
# --inputPathMco /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/mco_regulondb
# --inputFileMco
##########################################
# MAIN PROGRAM #
##########################################
if __name__ == "__main__":
    # Defining parameters
    parser = argparse.ArgumentParser(
        prog='extract-sentences-from-softfiles',
        description='Sentences extraction from XML Soft files.',
        epilog='')
    parser.add_argument("--inputPath", dest="inputPath",
                        help="Path to XML Soft files", metavar="PATH")
    parser.add_argument("--outputPath", dest="outputPath",
                        help="Path for output files", metavar="PATH")
    parser.add_argument("--inputPathMco", dest="inputPathMco",
                        help="Path to MCO file", metavar="PATH")
    parser.add_argument("--inputFileMco", dest="inputFileMco",
                        help="MCO file", metavar="FILE")
    args = parser.parse_args()
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to XML Soft files: " + args.inputPath)
    print("Path to output files: " + args.outputPath)
    print("Path to MCO file: " + args.inputPathMco)
    print("MCO file: " + args.inputFileMco)
    print('-------------------------------- PROCESSING --------------------------------')
    ## Tags of GCs (growth conditions) into consideration:
    # culture medium, medium supplements, aeration, temperature,
    # pH, agitation, growth phase, optical density, genetic background
    # (a dead dict literal with the same content was removed here; it was
    # immediately shadowed by this list)
    tags = ['Gtype', 'Med', 'Phase', 'Supp',
            'Temp', 'OD', 'Anti', 'Agit',
            'Air', 'Vess', 'pH']
    # Tag types whose markup is stripped (their text content is kept, untagged)
    deleted_tags = ['Gversion', 'Substrain', 'Strain', 'Technique', 'Orgn']
    all_tags = tags + deleted_tags
    # Regex to check if a line contains at least one GC tag
    regex_has_tag = re.compile(r'<(' + '|'.join(all_tags) + r')>')
    # Regex to delete opening/closing markup of discarded tag types
    regex_delete_tag = re.compile(r'</?(' + '|'.join(deleted_tags) + r')>')
    # Regexes to replace XML tags with whitespace-separated plain-token
    # markers (INI_Tag / END_Tag) that survive stanza tokenization
    regex_subs_ini_tag = re.compile(r'<(?P<tag>(' + '|'.join(tags) + r'))>')
    regex_subs_end_tag = re.compile(r'</(?P<tag>(' + '|'.join(tags) + r'))>')
    # Regexes to recognize those markers again after tokenization
    regex_gc_ini_tag = re.compile(r'INI_(?P<tag>(' + '|'.join(tags) + r'))')
    regex_gc_end_tag = re.compile(r'END_(?P<tag>(' + '|'.join(tags) + r'))')
    # Testing file: GSE54899_family_retagged-05242019_validated.xml
    testing_file = "GSE54899_family_retagged-05242019_validated.xml"
    # stanza pipeline for tokenization, lemmatization and POS tagging,
    # with sentence segmentation enabled
    nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma')
    # Store field_name (bangline) and field_text
    field_name = ""
    field_text = ""
    # Count, per field_name, how many lines contained at least one GC tag
    hash_field_name = {}
    # Rows for sentences from fields that contained at least one GC tag;
    # the DataFrame is built once at the end (DataFrame.append was removed
    # in pandas 2.0 and row-by-row appending is quadratic)
    rows_to_check = []
    # Store serie number, e.g. ^SERIES = GSE54899
    serie = ""
    # Store series pubmed id, e.g. !Series_pubmed_id = 25222563
    serie_pubmed_id = ""
    # Store sample, e.g. ^SAMPLE = GSM1326335
    sample = ""
    for path, dirs, files in os.walk(args.inputPath):
        # For each file in dir
        for file in files:
            print(" Reading file..." + str(file))
            # Join with the walked `path` (not args.inputPath) so files in
            # subdirectories are opened correctly
            with open(os.path.join(path, file)) as iFile:
                for line in iFile:
                    line = line.rstrip('\n')
                    if line.find(" = ") == -1:
                        continue
                    # maxsplit=1 keeps field text intact even if it itself
                    # contains " = "
                    field_name, field_text = line.split(" = ", 1)
                    if field_name == "^SERIES":
                        serie = field_text
                    elif field_name == "!Series_pubmed_id":
                        serie_pubmed_id = field_text
                    elif field_name == "^SAMPLE":
                        sample = field_text
                    elif regex_has_tag.search(line):  # Contains GC tag
                        hash_field_name[field_name] = hash_field_name.get(field_name, 0) + 1
                        # Delete markup of discarded tag types
                        modified_sentence = regex_delete_tag.sub("", field_text)
                        # Substitute kept tags by INI_/END_ markers
                        modified_sentence = regex_subs_ini_tag.sub(r' INI_\g<tag> ', modified_sentence)
                        modified_sentence = regex_subs_end_tag.sub(r' END_\g<tag> ', modified_sentence)
                        doc = nlp(modified_sentence)
                        for sentence in doc.sentences:
                            list_transformed_sentence = []
                            # Current GC tag; "O" means outside any tag
                            gc_tag = "O"
                            in_tag = False
                            for word in sentence.words:
                                result = regex_gc_ini_tag.match(word.text)
                                if result:
                                    gc_tag = result.group("tag")
                                    in_tag = True
                                    continue
                                result = regex_gc_end_tag.match(word.text)
                                if result:
                                    gc_tag = "O"
                                    in_tag = False
                                    continue
                                if not in_tag:
                                    gc_tag = "O"
                                list_transformed_sentence.append(
                                    "{}|{}|{}|{}".format(word.text, word.lemma, word.xpos, gc_tag))
                            transformed_sentence = " ".join(list_transformed_sentence)
                            # Restore XML-style tags in the sentence text for curators
                            original_sentence = regex_gc_ini_tag.sub(r'<\g<tag>>', sentence.text)
                            original_sentence = regex_gc_end_tag.sub(r'</\g<tag>>', original_sentence)
                            rows_to_check.append({'serie': serie,
                                                  'serie_pubmed_id': serie_pubmed_id,
                                                  'sample': sample,
                                                  'field_name': field_name,
                                                  'original_sentence': original_sentence,
                                                  'modified_sentence': sentence.text,
                                                  'transformed_sentence': transformed_sentence})
    df_sentences_to_check = pd.DataFrame(
        rows_to_check,
        columns=['serie', 'serie_pubmed_id', 'sample', 'field_name',
                 'original_sentence', 'modified_sentence', 'transformed_sentence'])
    df_sentences_to_check.to_csv(os.path.join(args.outputPath, 'geo_sentences_to_check.csv'))
# (diff-viewer notice) This diff could not be displayed because it is too large.
import pandas as pd
import os
def transform_sentence_to_check_to_XML(inputPath, outputPath, inputFile, outputFile):
    """Convert the curated-sentences CSV into an XML file for manual checking.

    Reads inputFile from inputPath, sorts rows by original_sentence, and
    writes one <row> element per sentence to outputFile in outputPath. The
    <corrected_sentence> element is pre-filled with the original sentence so
    a curator can edit it in place.
    """
    df = pd.read_csv(os.path.join(inputPath, inputFile))
    # The CSV index column is read back unnamed; expose it as the row id
    df.rename(columns={'Unnamed: 0': 'row'}, inplace=True)
    df = df.sort_values(by=['original_sentence'])
    print(df.head(5))
    with open(os.path.join(outputPath, outputFile), mode='w') as ofile:
        ofile.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        ofile.write('<gcs_to_check xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="esquema-gcs-to-check.xsd">\n')
        for idx in df.index:
            record = df.loc[idx]
            parts = [
                '<row id="{}">\n'.format(record['row']),
                "\t<serie>{}</serie>\n".format(record['serie']),
                "\t<serie_pubmed_id>{}</serie_pubmed_id>\n".format(record['serie_pubmed_id']),
                "\t<sample>{}</sample>\n".format(record['sample']),
                "\t<field_name>{}</field_name>\n".format(record['field_name']),
                "\t<original_sentence>{}</original_sentence>\n".format(record['original_sentence']),
                # corrected_sentence starts as a copy of the original sentence
                "\t<corrected_sentence>{}</corrected_sentence>\n".format(record['original_sentence']),
                "</row>\n",
            ]
            ofile.write("".join(parts))
        ofile.write('</gcs_to_check>\n')
# Entry point: convert the manually fixed curated-sentences CSV into the XML
# file used for curator review.
# NOTE(review): input/output paths are hard-coded to one machine — consider
# exposing them via argparse like the sibling scripts; confirm before changing.
transform_sentence_to_check_to_XML(inputPath='/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences',
                                   outputPath='/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences',
                                   inputFile='geo_sentences_to_check_fixed.csv',
                                   #inputFile='geo_sentences_to_check.csv',
                                   outputFile='geo_sentences_to_check_fixed.xml'
                                   )
\ No newline at end of file