# Author: cmendezc
# New processing for retraining (original note: "Nuevo procesamiento para reentrenamiento")

import stanza
import argparse
import re
import os
import pandas as pd
# Objective
# Check if MCO terms appear in raw sentences from extracted sentences from softfiles
#
# Input parameters
# --inputPath=PATH Path to geo_sentences_to_check_fixed.csv
# /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
# --inputFile=PATH File geo_sentences_to_check_fixed.csv
# --inputPathMco Path to MCO term file
# /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/mco_regulondb
# --inputFileMco File with MCO terms GC_Terms.txt (tsv)
# --outputPath=PATH Path to place MCO terms that appeared in input file
#
# Output
# Files with MCO terms that appeared in input file
#
# _v1
# python check_mco_terms_in_sentences_v1.py
# --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
# --inputFile geo_sentences_to_check_fixed.csv
# --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
# --inputPathMco /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/mco_regulondb
# --inputFileMco GC_Terms.txt
# python check_mco_terms_in_sentences_v1.py --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences --inputFile geo_sentences_to_check_fixed.csv --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences --inputPathMco /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/mco_regulondb --inputFileMco GC_Terms.txt
'''
RESULTADO DE LA REVISIÓN:
Sólo se encontraron los siguientes términos en las oraciones extraídas de la curación de los softfiles:
TERM_NAME
L broth (también fue anotado por el curador)
MOPS (anotado por el curador como <Med> MOPS minimal glucose media </Med>)
glucose (también fue anotado por el curador, pero no como palabra aislada)
nitrate (también fue anotado por el curador, aislado como <Supp> nitrate </Supp> y también como parte de varios suplementos anotados por el curador)
M9 minimal medium (también fue anotado por el curador)
OD600 of 0.3 (también fue anotado por el curador)
Escherichia coli (no estamos considerando organismos)
LB medium (no anotado por el curador)
'''
##########################################
# MAIN PROGRAM #
##########################################
if __name__ == "__main__":
    # Defining parameters
    parser = argparse.ArgumentParser(
        prog='check_mco_terms_in_sentences_v1.py',
        description='Check if MCO terms appear in raw sentences from extracted sentences from softfiles.',
        epilog='')
    parser.add_argument("--inputPath", dest="inputPath",
                        help="Path to extracted sentences from softfiles", metavar="PATH")
    parser.add_argument("--inputFile", dest="inputFile",
                        help="Input extracted sentences from softfiles", metavar="FILE")
    parser.add_argument("--outputPath", dest="outputPath",
                        help="Path to place MCO terms that appeared in input file", metavar="PATH")
    parser.add_argument("--inputPathMco", dest="inputPathMco",
                        help="Path to MCO file", metavar="PATH")
    parser.add_argument("--inputFileMco", dest="inputFileMco",
                        help="MCO file", metavar="FILE")
    args = parser.parse_args()
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to extracted sentences from softfiles: " + args.inputPath)
    print("Input extracted sentences from softfiles: " + args.inputFile)
    print("Path to place MCO terms that appeared in input file: " + args.outputPath)
    print("Path to MCO file: " + args.inputPathMco)
    print("MCO file: " + args.inputFileMco)
    print('-------------------------------- PROCESSING --------------------------------')
    df_sentences_to_check = pd.read_csv(os.path.join(args.inputPath, args.inputFile))
    print(df_sentences_to_check.head(3))
    print(df_sentences_to_check.shape)
    df_mco_terms = pd.read_csv(os.path.join(args.inputPathMco, args.inputFileMco), sep="\t")
    print(df_mco_terms.head(3))
    print(df_mco_terms.shape)
    # Rebuild the raw text of each curated sentence: every token in
    # transformed_sentence is "word|lemma|xpos|tag", so the surface word
    # is the first "|"-separated field of each token.
    text_sentences = []
    for ind in df_sentences_to_check.index:
        line_trans = df_sentences_to_check['transformed_sentence'][ind]
        words = [token.split("|")[0] for token in line_trans.split()]
        text_sentence = " ".join(words)
        # Keep unique sentences only
        if text_sentence not in text_sentences:
            text_sentences.append(text_sentence)
    # Tokenize each MCO term with stanza so its token spacing matches the
    # rebuilt (tokenized) sentences before doing the substring search.
    nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt')
    # Collect matches in a plain list and build the DataFrame once at the end:
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
    # appending row-by-row is quadratic anyway.
    found_rows = []
    for ind in df_mco_terms.index:
        term_type = df_mco_terms['TERM_TYPE'][ind]
        term_name = df_mco_terms['TERM_NAME'][ind]
        doc = nlp(term_name)
        term_name_new = " ".join(w.text for w in doc.sentences[0].words)
        for s in (sent for sent in text_sentences if term_name_new in sent):
            print("TERM_TYPE {} TERM_NAME {} SENT {}".format(term_type, term_name, s))
            found_rows.append({'TERM_TYPE': term_type,
                               'TERM_NAME': term_name,
                               'SENTENCE': s})
    df_mco_terms_found = pd.DataFrame(found_rows, columns=['TERM_TYPE', 'TERM_NAME', 'SENTENCE'])
    df_mco_terms_found.to_csv(os.path.join(args.outputPath, 'MCO_terms_found_in_softfiles.tsv'), sep="\t")
# (diff artifact) @@ -15,11 +15,11 @@ import pandas as pd
# Files with sentences obtained from XML Soft files
#
# Examples
# python extract-sentences-from-softfiles.py
# python extract-sentences-from-softfiles_v2.py
# --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data
# --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
#
# python extract-sentences-from-softfiles.py --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
# python extract-sentences-from-softfiles_v2.py --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
##########################################
# MAIN PROGRAM #
# (diff artifact: hunk truncated here)
import stanza
import argparse
import re
import os
import pandas as pd
# Objective
# Sentences extraction from XML Soft files. _v3 includes dictionary-based NER of MCO conditions
#
# Input parameters
# --inputPath=PATH Path to XML Soft files
# --outputPath=PATH Path to place output files
#
# Output
# Files with sentences obtained from XML Soft files
#
# Examples
# python extract-sentences-from-softfiles_v2.py
# --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data
# --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
#
# python extract-sentences-from-softfiles_v2.py --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
# _v3
# python extract-sentences-from-softfiles_v3.py
# --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data
# --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
# --inputPathMco /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/mco_regulondb
# --inputFileMco
##########################################
# MAIN PROGRAM #
##########################################
if __name__ == "__main__":
    # Defining parameters
    parser = argparse.ArgumentParser(
        prog='extract-sentences-from-softfiles',
        description='Sentences extraction from XML Soft files.',
        epilog='')
    parser.add_argument("--inputPath", dest="inputPath",
                        help="Path to XML Soft files", metavar="PATH")
    parser.add_argument("--outputPath", dest="outputPath",
                        help="Path for output files", metavar="PATH")
    parser.add_argument("--inputPathMco", dest="inputPathMco",
                        help="Path to MCO file", metavar="PATH")
    parser.add_argument("--inputFileMco", dest="inputFileMco",
                        help="MCO file", metavar="FILE")
    args = parser.parse_args()
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to XML Soft files: " + args.inputPath)
    print("Path to output files: " + args.outputPath)
    print("Path to MCO file: " + args.inputPathMco)
    print("MCO file: " + args.inputFileMco)
    print('-------------------------------- PROCESSING --------------------------------')
    ## Tags of GCs (growth conditions) into consideration:
    # culture medium, medium supplements, aeration, temperature,
    # pH, agitation, growth phase, optical density, genetic background
    # (a dead dict literal with the same content was removed here; it was
    # immediately shadowed by this list)
    tags = ['Gtype', 'Med', 'Phase', 'Supp',
            'Temp', 'OD', 'Anti', 'Agit',
            'Air', 'Vess', 'pH']
    # Tag types whose markup is stripped (their text content is kept, untagged)
    deleted_tags = ['Gversion', 'Substrain', 'Strain', 'Technique', 'Orgn']
    all_tags = tags + deleted_tags
    # Regex to check if a line contains at least one GC tag
    regex_has_tag = re.compile(r'<(' + '|'.join(all_tags) + r')>')
    # Regex to delete opening/closing markup of discarded tag types
    regex_delete_tag = re.compile(r'</?(' + '|'.join(deleted_tags) + r')>')
    # Regexes to replace XML tags with whitespace-separated plain-token
    # markers (INI_Tag / END_Tag) that survive stanza tokenization
    regex_subs_ini_tag = re.compile(r'<(?P<tag>(' + '|'.join(tags) + r'))>')
    regex_subs_end_tag = re.compile(r'</(?P<tag>(' + '|'.join(tags) + r'))>')
    # Regexes to recognize those markers again after tokenization
    regex_gc_ini_tag = re.compile(r'INI_(?P<tag>(' + '|'.join(tags) + r'))')
    regex_gc_end_tag = re.compile(r'END_(?P<tag>(' + '|'.join(tags) + r'))')
    # Testing file: GSE54899_family_retagged-05242019_validated.xml
    testing_file = "GSE54899_family_retagged-05242019_validated.xml"
    # stanza pipeline for tokenization, lemmatization and POS tagging,
    # with sentence segmentation enabled
    nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma')
    # Store field_name (bangline) and field_text
    field_name = ""
    field_text = ""
    # Count, per field_name, how many lines contained at least one GC tag
    hash_field_name = {}
    # Rows for sentences from fields that contained at least one GC tag;
    # the DataFrame is built once at the end (DataFrame.append was removed
    # in pandas 2.0 and row-by-row appending is quadratic)
    rows_to_check = []
    # Store serie number, e.g. ^SERIES = GSE54899
    serie = ""
    # Store series pubmed id, e.g. !Series_pubmed_id = 25222563
    serie_pubmed_id = ""
    # Store sample, e.g. ^SAMPLE = GSM1326335
    sample = ""
    for path, dirs, files in os.walk(args.inputPath):
        # For each file in dir
        for file in files:
            print(" Reading file..." + str(file))
            # Join with the walked `path` (not args.inputPath) so files in
            # subdirectories are opened correctly
            with open(os.path.join(path, file)) as iFile:
                for line in iFile:
                    line = line.rstrip('\n')
                    if line.find(" = ") == -1:
                        continue
                    # maxsplit=1 keeps field text intact even if it itself
                    # contains " = "
                    field_name, field_text = line.split(" = ", 1)
                    if field_name == "^SERIES":
                        serie = field_text
                    elif field_name == "!Series_pubmed_id":
                        serie_pubmed_id = field_text
                    elif field_name == "^SAMPLE":
                        sample = field_text
                    elif regex_has_tag.search(line):  # Contains GC tag
                        hash_field_name[field_name] = hash_field_name.get(field_name, 0) + 1
                        # Delete markup of discarded tag types
                        modified_sentence = regex_delete_tag.sub("", field_text)
                        # Substitute kept tags by INI_/END_ markers
                        modified_sentence = regex_subs_ini_tag.sub(r' INI_\g<tag> ', modified_sentence)
                        modified_sentence = regex_subs_end_tag.sub(r' END_\g<tag> ', modified_sentence)
                        doc = nlp(modified_sentence)
                        for sentence in doc.sentences:
                            list_transformed_sentence = []
                            # Current GC tag; "O" means outside any tag
                            gc_tag = "O"
                            in_tag = False
                            for word in sentence.words:
                                result = regex_gc_ini_tag.match(word.text)
                                if result:
                                    gc_tag = result.group("tag")
                                    in_tag = True
                                    continue
                                result = regex_gc_end_tag.match(word.text)
                                if result:
                                    gc_tag = "O"
                                    in_tag = False
                                    continue
                                if not in_tag:
                                    gc_tag = "O"
                                list_transformed_sentence.append(
                                    "{}|{}|{}|{}".format(word.text, word.lemma, word.xpos, gc_tag))
                            transformed_sentence = " ".join(list_transformed_sentence)
                            # Restore XML-style tags in the sentence text for curators
                            original_sentence = regex_gc_ini_tag.sub(r'<\g<tag>>', sentence.text)
                            original_sentence = regex_gc_end_tag.sub(r'</\g<tag>>', original_sentence)
                            rows_to_check.append({'serie': serie,
                                                  'serie_pubmed_id': serie_pubmed_id,
                                                  'sample': sample,
                                                  'field_name': field_name,
                                                  'original_sentence': original_sentence,
                                                  'modified_sentence': sentence.text,
                                                  'transformed_sentence': transformed_sentence})
    df_sentences_to_check = pd.DataFrame(
        rows_to_check,
        columns=['serie', 'serie_pubmed_id', 'sample', 'field_name',
                 'original_sentence', 'modified_sentence', 'transformed_sentence'])
    df_sentences_to_check.to_csv(os.path.join(args.outputPath, 'geo_sentences_to_check.csv'))
# (diff-viewer notice) This diff could not be displayed because it is too large.
import pandas as pd
import os
def transform_sentence_to_check_to_XML(inputPath, outputPath, inputFile, outputFile):
    """Convert the curated-sentences CSV into an XML file for manual checking.

    Reads inputFile from inputPath, sorts rows by original_sentence, and
    writes one <row> element per sentence to outputFile in outputPath. The
    <corrected_sentence> element is pre-filled with the original sentence so
    a curator can edit it in place.
    """
    df = pd.read_csv(os.path.join(inputPath, inputFile))
    # The CSV index column is read back unnamed; expose it as the row id
    df.rename(columns={'Unnamed: 0': 'row'}, inplace=True)
    df = df.sort_values(by=['original_sentence'])
    print(df.head(5))
    with open(os.path.join(outputPath, outputFile), mode='w') as ofile:
        ofile.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        ofile.write('<gcs_to_check xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="esquema-gcs-to-check.xsd">\n')
        for idx in df.index:
            record = df.loc[idx]
            parts = [
                '<row id="{}">\n'.format(record['row']),
                "\t<serie>{}</serie>\n".format(record['serie']),
                "\t<serie_pubmed_id>{}</serie_pubmed_id>\n".format(record['serie_pubmed_id']),
                "\t<sample>{}</sample>\n".format(record['sample']),
                "\t<field_name>{}</field_name>\n".format(record['field_name']),
                "\t<original_sentence>{}</original_sentence>\n".format(record['original_sentence']),
                # corrected_sentence starts as a copy of the original sentence
                "\t<corrected_sentence>{}</corrected_sentence>\n".format(record['original_sentence']),
                "</row>\n",
            ]
            ofile.write("".join(parts))
        ofile.write('</gcs_to_check>\n')
# Entry point: convert the manually fixed curated-sentences CSV into the XML
# file used for curator review.
# NOTE(review): input/output paths are hard-coded to one machine — consider
# exposing them via argparse like the sibling scripts; confirm before changing.
transform_sentence_to_check_to_XML(inputPath='/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences',
                                   outputPath='/home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences',
                                   inputFile='geo_sentences_to_check_fixed.csv',
                                   #inputFile='geo_sentences_to_check.csv',
                                   outputFile='geo_sentences_to_check_fixed.xml'
                                   )
\ No newline at end of file