# extract-sentences-from-softfiles_v3.py
import stanza
import argparse
import re
import os
import pandas as pd
# Objective
# Sentence extraction from XML SOFT files. _v3 adds dictionary-based NER of MCO conditions.
#
# Input parameters
# --inputPath=PATH        Path to XML SOFT files
# --outputPath=PATH       Path to place output files
# --inputPathMco=PATH     Path to the MCO file (v3 only)
# --inputFileMco=FILE     MCO file (v3 only)
#
# Output
# Files with sentences obtained from the XML SOFT files
#
# Examples
# python extract-sentences-from-softfiles_v2.py
# --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data
# --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
#
# python extract-sentences-from-softfiles_v2.py --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
# _v3
# python extract-sentences-from-softfiles_v3.py
# --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data
# --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
# --inputPathMco /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/mco_regulondb
# --inputFileMco
##########################################
# MAIN PROGRAM #
##########################################
if __name__ == "__main__":
# Defining parameters
parser = argparse.ArgumentParser(
prog='extract-sentences-from-softfiles',
description='Sentences extraction from XML Soft files.',
epilog='')
parser.add_argument("--inputPath", dest="inputPath",
help="Path to XML Soft files", metavar="PATH")
parser.add_argument("--outputPath", dest="outputPath",
help="Path for output files", metavar="PATH")
parser.add_argument("--inputPathMco", dest="inputPathMco",
help="Path to MCO file", metavar="PATH")
parser.add_argument("--inputFileMco", dest="inputFileMco",
help="MCO file", metavar="FILE")
args = parser.parse_args()
    print('-------------------------------- PARAMETERS --------------------------------')
    # format() tolerates missing (None) optional arguments, unlike string concatenation
    print("Path to XML SOFT files: {}".format(args.inputPath))
    print("Path to output files: {}".format(args.outputPath))
    print("Path to MCO file: {}".format(args.inputPathMco))
    print("MCO file: {}".format(args.inputFileMco))
print('-------------------------------- PROCESSING --------------------------------')
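    # NOTE: v3 accepts the MCO arguments, but the dictionary is not yet consumed below.
    # A minimal sketch of loading it for the planned dictionary-based NER step
    # (the CSV layout and the TERM_NAME column are assumptions; adjust to the
    # actual MCO export from RegulonDB):
    # mco_terms = set()
    # if args.inputPathMco and args.inputFileMco:
    #     df_mco = pd.read_csv(os.path.join(args.inputPathMco, args.inputFileMco))
    #     mco_terms = set(df_mco['TERM_NAME'].str.lower())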
    ## GC (growth condition) tags taken into consideration:
    # culture medium, medium supplements, aeration, temperature,
    # pH, agitation, growth phase, optical density, genetic background
tags = ['Gtype', 'Med', 'Phase', 'Supp',
'Temp', 'OD', 'Anti', 'Agit',
'Air', 'Vess', 'pH']
deleted_tags = ['Gversion', 'Substrain', 'Strain', 'Technique', 'Orgn']
all_tags = tags + deleted_tags
# Regex to check if line has a tag
regex_has_tag = re.compile(r'<(' + '|'.join(all_tags) + r')>')
# Regex to delete tags
regex_delete_tag = re.compile(r'</?(' + '|'.join(deleted_tags) + r')>')
# Regex to substitute tags
regex_subs_ini_tag = re.compile(r'<(?P<tag>(' + '|'.join(tags) + r'))>')
regex_subs_end_tag = re.compile(r'</(?P<tag>(' + '|'.join(tags) + r'))>')
# Regex to tag GCs
regex_gc_ini_tag = re.compile(r'INI_(?P<tag>(' + '|'.join(tags) + r'))')
regex_gc_end_tag = re.compile(r'END_(?P<tag>(' + '|'.join(tags) + r'))')
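    # For example, the fragment '<Med>LB medium</Med>' becomes
    # ' INI_Med LB medium</Med>' after regex_subs_ini_tag.sub(r' INI_\g<tag> ', ...)
    # and ' INI_Med LB medium END_Med ' after regex_subs_end_tag.sub(r' END_\g<tag> ', ...)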
# Testing file: GSE54899_family_retagged-05242019_validated.xml
testing_file = "GSE54899_family_retagged-05242019_validated.xml"
# Define stanza pipeline for sentence segmentation
# nlp_sentence_segmentation = stanza.Pipeline(lang='en', processors='tokenize')
# Define stanza pipeline for lemmatization and pos tagging without sentence segmentation
# nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma', tokenize_no_ssplit=True)
# Define stanza pipeline for lemmatization and pos tagging with sentence segmentation
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma')
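    # nlp(text) returns a Document; each item in Document.sentences exposes .text and
    # .words, and every word carries the .text, .lemma and .xpos attributes used below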
# Store field_name (bangline) and field_text
field_name = ""
field_text = ""
# Store list of unique field_name
hash_field_name = {}
    # Store sentences from fields that contained at least one GC tag,
    # so that a curator can review them.
    # Rows are collected in a list and converted to a DataFrame at the end:
    # appending row by row with DataFrame.append is slow and was removed in pandas 2.0.
    rows_sentences_to_check = []
# Store serie number
# ^SERIES = GSE54899
serie = ""
# Store series pubmed id
# !Series_pubmed_id = 25222563
serie_pubmed_id = ""
# Store sample
# ^SAMPLE = GSM1326335
sample = ""
for path, dirs, files in os.walk(args.inputPath):
# For each file in dir
for file in files:
# if file == testing_file:
print(" Reading file..." + str(file))
            # Join with the walk's current directory so files in subfolders are also found
            with open(os.path.join(path, file)) as iFile:
for line in iFile:
line = line.rstrip('\n')
if line.find(" = ") == -1:
continue
                    # Split only on the first " = " so values containing " = " stay intact
                    list_line = line.split(" = ", 1)
field_name = list_line[0]
#print("field_name: {}".format(field_name))
field_text = list_line[1]
#print("field_text: {}".format(field_text))
if field_name == "^SERIES":
serie = field_text
elif field_name == "!Series_pubmed_id":
serie_pubmed_id = field_text
elif field_name == "^SAMPLE":
sample = field_text
elif regex_has_tag.search(line): # Contains GC tag
if field_name in hash_field_name:
hash_field_name[field_name] += 1
else:
hash_field_name[field_name] = 1
                        # Delete out-of-scope GC tags (deleted_tags); one pass removes all occurrences
                        modified_sentence = regex_delete_tag.sub("", field_text)
                        # Replace '<Tag>'/'</Tag>' with ' INI_Tag '/' END_Tag ' marker tokens,
                        # so that the tags survive tokenization as standalone words
                        modified_sentence = regex_subs_ini_tag.sub(r' INI_\g<tag> ', modified_sentence)
                        modified_sentence = regex_subs_end_tag.sub(r' END_\g<tag> ', modified_sentence)
doc = nlp(modified_sentence)
                        for sentence in doc.sentences:
# print(sentence.text)
list_transformed_sentence = []
# For GC tag
gc_tag = "O"
in_tag = False
                            for word in sentence.words:
                                # Opening marker (INI_Tag): start labeling words with this GC tag
                                result = regex_gc_ini_tag.match(word.text)
                                if result:
                                    gc_tag = result.group("tag")
                                    in_tag = True
                                    continue
                                # Closing marker (END_Tag): back to the outside label
                                if regex_gc_end_tag.match(word.text):
                                    gc_tag = "O"
                                    in_tag = False
                                    continue
                                if not in_tag:
                                    gc_tag = "O"
                                # word|lemma|POS|GC-tag
                                list_transformed_sentence.append("{}|{}|{}|{}".format(word.text, word.lemma, word.xpos, gc_tag))
transformed_sentence = " ".join(list_transformed_sentence)
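                            # e.g. "grown|grow|VBN|O in|in|IN|O LB|LB|NNP|Med medium|medium|NN|Med"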
original_sentence = regex_gc_ini_tag.sub(r'<\g<tag>>', sentence.text)
original_sentence = regex_gc_end_tag.sub(r'</\g<tag>>', original_sentence)
                            new_row = {'serie': serie,
                                       'serie_pubmed_id': serie_pubmed_id,
                                       'sample': sample,
                                       'field_name': field_name,
                                       'original_sentence': original_sentence,
                                       'modified_sentence': sentence.text,
                                       'transformed_sentence': transformed_sentence}
                            rows_sentences_to_check.append(new_row)
    # One row per extracted sentence, for manual curation of the GC annotations
    df_sentences_to_check = pd.DataFrame(rows_sentences_to_check,
                                         columns=['serie', 'serie_pubmed_id', 'sample', 'field_name',
                                                  'original_sentence', 'modified_sentence', 'transformed_sentence'])
    df_sentences_to_check.to_csv(os.path.join(args.outputPath, 'geo_sentences_to_check.csv'))