import stanza
import argparse
import re
import os
import pandas as pd

# Objective
# Check whether MCO terms appear in the raw sentences extracted from soft files.
#
# Input parameters
# --inputPath=PATH          Path to geo_sentences_to_check_fixed.csv
#                           e.g. /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
# --inputFile=FILE          File geo_sentences_to_check_fixed.csv
# --inputPathMco=PATH       Path to MCO term file
#                           e.g. /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/mco_regulondb
# --inputFileMco=FILE       File with MCO terms, GC_Terms.txt (tsv)
# --outputPath=PATH         Path to place MCO terms that appeared in the input file
#
# Output
# Files with the MCO terms that appeared in the input file
#
# _v1
# python check_mco_terms_in_sentences_v1.py
# --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
# --inputFile geo_sentences_to_check_fixed.csv
# --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
# --inputPathMco /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/mco_regulondb
# --inputFileMco GC_Terms.txt
# python check_mco_terms_in_sentences_v1.py --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences --inputFile geo_sentences_to_check_fixed.csv --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences --inputPathMco /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/mco_regulondb --inputFileMco GC_Terms.txt

'''
REVIEW RESULT:
Only the following terms were found in the sentences extracted from the curated soft files:
TERM_NAME
L broth (also annotated by the curator)
MOPS    (annotated by the curator as <Med> MOPS minimal glucose media </Med>)
glucose (also annotated by the curator, but not as an isolated word)
nitrate (also annotated by the curator, both in isolation as <Supp> nitrate </Supp> and as part of several curator-annotated supplements)
M9 minimal medium   (also annotated by the curator)
OD600 of 0.3    (also annotated by the curator)
Escherichia coli    (we are not considering organisms)
LB medium   (not annotated by the curator)
'''

##########################################
#               MAIN PROGRAM             #
##########################################

if __name__ == "__main__":
    # Defining parameters
    parser = argparse.ArgumentParser(
        prog='check_mco_terms_in_sentences_v1.py',
        description='Check if MCO terms appear in raw sentences from extracted sentences from softfiles.',
        epilog='')
    parser.add_argument("--inputPath", dest="inputPath",
                        help="Path to extracted sentences from softfiles", metavar="PATH")
    parser.add_argument("--inputFile", dest="inputFile",
                        help="Input extracted sentences from softfiles", metavar="FILE")
    parser.add_argument("--outputPath", dest="outputPath",
                        help="Path to place MCO terms that appeared in input file", metavar="PATH")
    parser.add_argument("--inputPathMco", dest="inputPathMco",
                        help="Path to MCO file", metavar="PATH")
    parser.add_argument("--inputFileMco", dest="inputFileMco",
                        help="MCO file", metavar="FILE")
    args = parser.parse_args()

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to extracted sentences from softfiles: " + args.inputPath)
    print("Input extracted sentences from softfiles: " + args.inputFile)
    print("Path to place MCO terms that appeared in input file: " + args.outputPath)
    print("Path to MCO file: " + args.inputPathMco)
    print("MCO file: " + args.inputFileMco)
    print('-------------------------------- PROCESSING --------------------------------')

    df_sentences_to_check = pd.read_csv(os.path.join(args.inputPath, args.inputFile))
    print(df_sentences_to_check.head(3))
    print(df_sentences_to_check.shape)

    df_mco_terms = pd.read_csv(os.path.join(args.inputPathMco, args.inputFileMco), sep="\t")
    print(df_mco_terms.head(3))
    print(df_mco_terms.shape)

    # Matches are collected as plain dicts and turned into a DataFrame at the end;
    # DataFrame.append() was removed in pandas 2.x.
    rows_found = []

    text_sentences = []
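    # Rebuild raw sentences: each whitespace-separated token in transformed_sentence is
    # pipe-delimited, so the surface word is the part before the first "|"; duplicate
    # sentences are skipped so each one is checked only once.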
    for ind in df_sentences_to_check.index:
        line_trans = df_sentences_to_check['transformed_sentence'][ind]
        list_line = line_trans.split()
        list_sentence = [tokens.split("|")[0] for tokens in list_line]
        text_sentence = " ".join(list_sentence)
        # print(text_sentence)
        if text_sentence not in text_sentences:
            text_sentences.append(text_sentence)

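    # Stanza English pipeline (tokenization plus multi-word-token expansion): used to
    # re-tokenize each MCO term so its spacing matches the whitespace-joined sentences
    # built above before doing substring matching.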
    nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt')

    for ind in df_mco_terms.index:
        term_type = df_mco_terms['TERM_TYPE'][ind]
        term_name = df_mco_terms['TERM_NAME'][ind]
        # Tokenize the MCO term and re-join with single spaces so it can be matched
        # as a substring of the whitespace-joined sentences.
        doc = nlp(term_name)
        word_list = [w.text for w in doc.sentences[0].words]
        term_name_new = " ".join(word_list)
        # print(term_name_new)
        sentences_found = [sent for sent in text_sentences if term_name_new in sent]
        for s in sentences_found:
            print("TERM_TYPE {} TERM_NAME {} SENT {}".format(term_type, term_name, s))
            rows_found.append({'TERM_TYPE': term_type,
                               'TERM_NAME': term_name,
                               'SENTENCE': s})

    df_mco_terms_found = pd.DataFrame(rows_found, columns=['TERM_TYPE', 'TERM_NAME', 'SENTENCE'])
    df_mco_terms_found.to_csv(os.path.join(args.outputPath, 'MCO_terms_found_in_softfiles.tsv'),
                              sep="\t", index=False)