# extract-sentences-from-softfiles_v2.py (11.4 KB)
import stanza
import argparse
import re
import os
import pandas as pd

# Objective
# Sentences extraction from XML Soft files.
#
# Input parameters
# --inputPath=PATH    		Path to XML Soft files
# --outputPath=PATH   		Path to place output files
#
# Output
# Files with sentences obtained from XML Soft files
#
# Examples
# python extract-sentences-from-softfiles.py
# --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data
# --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
#
# python extract-sentences-from-softfiles.py --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/tagged-xml-data --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences

##########################################
#               MAIN PROGRAM             #
##########################################

if __name__ == "__main__":
    # Command-line interface. Both paths are mandatory: every later step
    # reads from inputPath and writes to outputPath, and without
    # required=True a missing flag would surface as a confusing
    # "can only concatenate str and NoneType" TypeError below instead of
    # a clean argparse usage error.
    parser = argparse.ArgumentParser(
        prog='extract-sentences-from-softfiles',
        description='Sentences extraction from XML Soft files.',
        epilog='')
    parser.add_argument("--inputPath", dest="inputPath", required=True,
                        help="Path to XML Soft files", metavar="PATH")
    parser.add_argument("--outputPath", dest="outputPath", required=True,
                        help="Path for output files", metavar="PATH")

    args = parser.parse_args()

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to XML Soft files: " + args.inputPath)
    print("Path to output files: " + args.outputPath)
    print('-------------------------------- PROCESSING --------------------------------')

    ## Tags of growth conditions (GCs) taken into consideration:
    # culture medium, medium supplements, aeration, temperature,
    # pH, agitation, growth phase, optical density, genetic background.
    # (An earlier dict literal mapping '<Tag>' -> 'Tag' was dead code:
    # it was immediately shadowed by this list and has been removed.)
    tags = ['Gtype', 'Med', 'Phase', 'Supp',
            'Temp', 'OD', 'Anti', 'Agit',
            'Air', 'Vess', 'pH']
    # Tags whose markup is discarded (content kept, tag removed)
    deleted_tags = ['Gversion', 'Substrain', 'Strain', 'Technique']
    all_tags = tags + deleted_tags
    # Regex to check if a line has any tag at all (kept or discarded)
    regex_has_tag = re.compile(r'<(' + '|'.join(all_tags) + r')>')
    # Regex to delete opening/closing markup of discarded tags,
    # e.g. '<Strain>' and '</Strain>' are both stripped
    regex_delete_tag = re.compile(r'</?(' + '|'.join(deleted_tags) + r')>')
    # Regexes to substitute kept tags with placeholder tokens so they
    # survive tokenization, e.g. '<Med>' -> ' INI_Med ', '</Med>' -> ' END_Med '
    regex_subs_ini_tag = re.compile(r'<(?P<tag>(' + '|'.join(tags) + r'))>')
    regex_subs_end_tag = re.compile(r'</(?P<tag>(' + '|'.join(tags) + r'))>')
    # Regexes to recognize those placeholder tokens again after tokenization
    regex_gc_ini_tag = re.compile(r'INI_(?P<tag>(' + '|'.join(tags) + r'))')
    regex_gc_end_tag = re.compile(r'END_(?P<tag>(' + '|'.join(tags) + r'))')

    # Testing file: only this file is processed by the walk below
    testing_file = "GSE54899_family_retagged-05242019_validated.xml"

    # Define stanza pipeline for sentence segmentation
    # NOTE(review): this pipeline is never used anywhere in the visible
    # file — only `nlp` below is applied. Consider removing it, since it
    # loads a full stanza model at startup.
    nlp_sentence_segmentation = stanza.Pipeline(lang='en', processors='tokenize')
    # Define stanza pipeline for lemmatization and pos tagging without sentence segmentation
    # (tokenize_no_ssplit=True treats each input text as one sentence unit)
    nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma', tokenize_no_ssplit=True)

    # Store field_name (bangline) and field_text
    field_name = ""
    field_text = ""

    # Store list of unique field_name
    # (maps field_name -> count of tagged lines seen with that name)
    hash_field_name = {}

    # Store sentences from fields that contained at least one GC tag.
    #   We want to use this list for someone to check it
    df_sentences_to_check = pd.DataFrame(columns=['serie', 'serie_pubmed_id', 'sample', 'field_name', 'original_sentence', 'modified_sentence', 'transformed_sentence'])

    # Store serie number
    #   ^SERIES = GSE54899
    serie = ""
    # Store series pubmed id
    #   !Series_pubmed_id = 25222563
    serie_pubmed_id = ""
    # Store sample
    #   ^SAMPLE = GSM1326335
    sample = ""

    for path, dirs, files in os.walk(args.inputPath):
        # For each file in dir
        for file in files:
            if file == testing_file:
                print("   Reading file..." + str(file))
                # Join with the walked `path` (not args.inputPath) so a file
                # located in a subdirectory is opened correctly.
                with open(os.path.join(path, file)) as iFile:
                    for line in iFile:
                        line = line.rstrip('\n')
                        # SOFT format lines look like "<field_name> = <field_text>"
                        if line.find(" = ") == -1:
                            continue
                        # maxsplit=1 keeps any further " = " inside the text;
                        # the old unbounded split silently dropped it.
                        field_name, field_text = line.split(" = ", 1)
                        if field_name == "^SERIES":
                            serie = field_text
                        elif field_name == "!Series_pubmed_id":
                            serie_pubmed_id = field_text
                        elif field_name == "^SAMPLE":
                            sample = field_text
                        elif regex_has_tag.search(line):    # Contains GC tag
                            if field_name in hash_field_name:
                                hash_field_name[field_name] += 1
                            else:
                                hash_field_name[field_name] = 1
                            original_sentence = field_text
                            # Strip markup of discarded tags (a single pass is
                            # enough; the previous duplicate call was redundant)
                            modified_sentence = regex_delete_tag.sub("", field_text)
                            # Substitute kept tags with INI_/END_ placeholders so
                            # they survive tokenization as standalone tokens
                            modified_sentence = regex_subs_ini_tag.sub(r' INI_\g<tag> ', modified_sentence)
                            modified_sentence = regex_subs_end_tag.sub(r' END_\g<tag> ', modified_sentence)
                            doc = nlp(modified_sentence)
                            for i, sentence in enumerate(doc.sentences):
                                list_transformed_sentence = []
                                # Current GC label; "O" = outside any tag
                                gc_tag = "O"
                                in_tag = False
                                for word in sentence.words:
                                    result = regex_gc_ini_tag.match(word.text)
                                    if result:
                                        # Opening placeholder: start labeling words
                                        gc_tag = result.group("tag")
                                        in_tag = True
                                        continue
                                    result = regex_gc_end_tag.match(word.text)
                                    if result:
                                        # Closing placeholder: back to "O"
                                        gc_tag = "O"
                                        in_tag = False
                                        continue
                                    if not in_tag:
                                        gc_tag = "O"
                                    # word|lemma|xpos|label, one token per entry
                                    list_transformed_sentence.append("{}|{}|{}|{}".format(word.text, word.lemma, word.xpos, gc_tag))
                                transformed_sentence = " ".join(list_transformed_sentence)
                                new_row = {'serie': serie,
                                           'serie_pubmed_id': serie_pubmed_id,
                                           'sample': sample,
                                           'field_name': field_name,
                                           'original_sentence': original_sentence,
                                           'modified_sentence': sentence.text,
                                           'transformed_sentence': transformed_sentence}
                                # DataFrame.append was deprecated in pandas 1.4
                                # and removed in 2.0; build a one-row frame and
                                # concatenate instead.
                                df_sentences_to_check = pd.concat(
                                    [df_sentences_to_check, pd.DataFrame([new_row])],
                                    ignore_index=True)
    df_sentences_to_check.to_csv(os.path.join(args.outputPath, 'geo_sentences_to_check.csv'))
    # Everything below this point is legacy code and must not run.
    quit()

    ## End of tagging
    # NOTE(review): everything from here to the end of the file is
    # unreachable — the quit() above always fires first. It also references
    # names never defined in this file (`in_labels`, `random`,
    # `args.inputFile`, `args.index`, `args.trainingFile`, `args.testFile`),
    # so it would raise NameError/AttributeError if it ever ran. It appears
    # to be a leftover from a previous (CoreNLP-based) version that built a
    # 70/30 train/test split; kept byte-identical for reference, but it
    # should probably be deleted or moved to version control history.
    out_labels = {
        '</Gtype>': 'O',
        '</Gversion>': 'O',
        '</Med>': 'O',
        '</Phase>': 'O',
        '</Substrain>': 'O',
        '</Supp>': 'O',
        '</Strain>': 'O',
        '</Technique>': 'O',
        '</Temp>': 'O',
        '</OD>': 'O',
        '</Anti>': 'O',
        '</Agit>': 'O',
        '</Air>': 'O',
        '</Vess>': 'O',
        '</pH>': 'O'}
    # Obsolete organism tags, ignored when rebuilding sentences
    old_labels = {
        '<Orgn>': 'O',
        '</Orgn>': 'O'
    }

    # Other label
    flag = 'O'
    lista = []
    # First sentence
    sentence = ''
    n = 0
    # NOTE(review): `args.inputFile` is not declared by the argparse setup
    # above — this open() would fail with AttributeError.
    with open(os.path.join(args.inputPath, args.inputFile), "r") as input_file:
        for line in input_file:
            if len(line.split('\t')) > 1:
                w = line.split('\t')[1]
                # NOTE(review): `in_labels` is never defined -> NameError here.
                if w in in_labels or w in out_labels:
                    # Tagging
                    if w in in_labels.keys(): flag = in_labels[w]
                    if w in out_labels: flag = out_labels[w]
                else:
                    if w == "PGCGROWTHCONDITIONS":
                        n = n + 1
                        words = sentence.split(' ')
                        # End of sentence
                        tags = [tag for tag in words if tag.split('|')[-1] in in_labels.values()]
                        # At least one true-tag on sentence
                        if len(tags) > 0:
                            lista.append(sentence)
                            # New setence
                        sentence = ''
                    elif w not in old_labels.keys():
                        # Building and save tagging sentence
                        sentence = sentence + ' ' + ('|'.join(line.split('\t')[1:args.index]) + '|' + flag + ' ')

    print("Number of sentences with at least one tag: " + str(len(lista)))
    print("Number of sentences from CoreNLP: " + str(n))

    # Split 70 30 training and test sentences
    # NOTE(review): `random` is never imported in this file -> NameError here.
    trainingIndex = random.sample(range(len(lista)), int(len(lista) * .70))
    testIndex = [n for n in range(len(lista)) if n not in trainingIndex]
    print("Number of sentences for training: " + str(len(trainingIndex)))
    print("Number of sentences for test: " + str(len(testIndex)))

    with open(os.path.join(args.outputPath, args.trainingFile), "w") as oFile:
        Data = [lista[i] for i in trainingIndex]
        oFile.write('\n'.join(Data))

    with open(os.path.join(args.outputPath, args.testFile), "w") as oFile:
        Data = [lista[i] for i in testIndex]
        oFile.write('\n'.join(Data))

    print("==================================END===================================")