check_mco_terms_in_sentences_v1.py
import stanza
import argparse
import re
import os
import pandas as pd
# Objective
# Check whether MCO terms appear in the raw sentences extracted from softfiles
# (an illustrative sketch of the expected input formats follows the usage example below).
#
# Input parameters
# --inputPath=PATH      Path to geo_sentences_to_check_fixed.csv
#                       /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
# --inputFile=FILE      File geo_sentences_to_check_fixed.csv
# --inputPathMco=PATH   Path to the MCO term file
#                       /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/mco_regulondb
# --inputFileMco=FILE   File with MCO terms, GC_Terms.txt (tab-separated)
# --outputPath=PATH     Path to place the MCO terms that appeared in the input file
#
# Output
# Files with the MCO terms that appeared in the input file
# (MCO_terms_found.tsv and MCO_terms_found_in_softfiles.tsv)
# _v1
# python check_mco_terms_in_sentences_v1.py
# --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
# --inputFile geo_sentences_to_check_fixed.csv
# --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences
# --inputPathMco /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/mco_regulondb
# --inputFileMco GC_Terms.txt
# python check_mco_terms_in_sentences_v1.py --inputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences --inputFile geo_sentences_to_check_fixed.csv --outputPath /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/data-curation/curated-sentences --inputPathMco /home/cmendezc/Documents/ccg/gitlab-automatic-extraction-growth-conditions/data-sets/mco_regulondb --inputFileMco GC_Terms.txt
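# Illustrative sketch (not executed by the script): the expected shape of the two
# input files, inferred from the processing code below. Only the column names
# ('transformed_sentence', TERM_TYPE, TERM_NAME) and the convention that the first
# field of each "|"-separated token is the surface word come from this script;
# the lemma/POS fields and the example values are hypothetical.
#
# geo_sentences_to_check_fixed.csv -- 'transformed_sentence' column, e.g.:
# >>> transformed = "cells|cell|NNS were|be|VBD grown|grow|VBN in|in|IN LB|LB|NN medium|medium|NN"
# >>> " ".join(token.split("|")[0] for token in transformed.split())
# 'cells were grown in LB medium'
#
# GC_Terms.txt -- tab-separated, with TERM_TYPE and TERM_NAME columns; the
# TERM_NAME values (e.g. "LB medium", "M9 minimal medium") are the strings
# matched against the rebuilt sentences.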
'''
REVIEW RESULT:
Only the following terms were found in the sentences extracted from the curation of the softfiles:
TERM_NAME
L broth (also annotated by the curator)
MOPS (annotated by the curator as <Med> MOPS minimal glucose media </Med>)
glucose (also annotated by the curator, but not as an isolated word)
nitrate (also annotated by the curator, both isolated as <Supp> nitrate </Supp> and as part of several supplements annotated by the curator)
M9 minimal medium (also annotated by the curator)
OD600 of 0.3 (also annotated by the curator)
Escherichia coli (we are not considering organisms)
LB medium (not annotated by the curator)
'''
##########################################
# MAIN PROGRAM #
##########################################
if __name__ == "__main__":
    # Defining parameters
    parser = argparse.ArgumentParser(
        prog='check_mco_terms_in_sentences_v1.py',
        description='Check whether MCO terms appear in the raw sentences extracted from softfiles.',
        epilog='')
    parser.add_argument("--inputPath", dest="inputPath",
                        help="Path to extracted sentences from softfiles", metavar="PATH")
    parser.add_argument("--inputFile", dest="inputFile",
                        help="Input extracted sentences from softfiles", metavar="FILE")
    parser.add_argument("--outputPath", dest="outputPath",
                        help="Path to place MCO terms that appeared in the input file", metavar="PATH")
    parser.add_argument("--inputPathMco", dest="inputPathMco",
                        help="Path to MCO file", metavar="PATH")
    parser.add_argument("--inputFileMco", dest="inputFileMco",
                        help="MCO file", metavar="FILE")
    args = parser.parse_args()

    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to extracted sentences from softfiles: " + args.inputPath)
    print("Input extracted sentences from softfiles: " + args.inputFile)
    print("Path to place MCO terms that appeared in input file: " + args.outputPath)
    print("Path to MCO file: " + args.inputPathMco)
    print("MCO file: " + args.inputFileMco)
    print('-------------------------------- PROCESSING --------------------------------')
    # Sentences extracted from softfiles: one tagged sentence per row in the
    # 'transformed_sentence' column.
    df_sentences_to_check = pd.read_csv(os.path.join(args.inputPath, args.inputFile))
    print(df_sentences_to_check.head(3))
    print(df_sentences_to_check.shape)

    # MCO terms: tab-separated file with TERM_TYPE and TERM_NAME columns.
    df_mco_terms = pd.read_csv(os.path.join(args.inputPathMco, args.inputFileMco), sep="\t")
    print(df_mco_terms.head(3))
    print(df_mco_terms.shape)

    # Rebuild the raw text of each sentence: every whitespace-separated token in
    # 'transformed_sentence' is a group of "|"-separated fields whose first field
    # is the surface word. Duplicate sentences are kept only once.
    text_sentences = []
    for ind in df_sentences_to_check.index:
        line_trans = df_sentences_to_check['transformed_sentence'][ind]
        list_line = line_trans.split()
        list_sentence = [token.split("|")[0] for token in list_line]
        text_sentence = " ".join(list_sentence)
        # print(text_sentence)
        if text_sentence not in text_sentences:
            text_sentences.append(text_sentence)
    # Tokenize each MCO term with Stanza so that its tokenization matches the
    # whitespace-joined sentence text (assumes the English models have already
    # been downloaded, e.g. with stanza.download('en')).
    nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt')
    rows_found = []
    with open(os.path.join(args.outputPath, "MCO_terms_found.tsv"), mode='w') as ofile:
        ofile.write("TERM_TYPE\tTERM_NAME\tSENTENCE\n")
        for ind in df_mco_terms.index:
            term_type = df_mco_terms['TERM_TYPE'][ind]
            term_name = df_mco_terms['TERM_NAME'][ind]
            doc = nlp(term_name)
            word_list = [w.text for w in doc.sentences[0].words]
            term_name_new = " ".join(word_list)
            # print(term_name_new)
            # Keep every distinct sentence that contains the tokenized term.
            sentences_found = [sent for sent in text_sentences if term_name_new in sent]
            for s in sentences_found:
                print("TERM_TYPE {} TERM_NAME {} SENT {}".format(term_type, term_name, s))
                ofile.write("{}\t{}\t{}\n".format(term_type, term_name, s))
                rows_found.append({'TERM_TYPE': term_type,
                                   'TERM_NAME': term_name,
                                   'SENTENCE': s})
    # DataFrame.append was removed in recent pandas versions, so the matches are
    # collected in a list and converted into a DataFrame in a single step.
    df_mco_terms_found = pd.DataFrame(rows_found, columns=['TERM_TYPE', 'TERM_NAME', 'SENTENCE'])
    df_mco_terms_found.to_csv(os.path.join(args.outputPath, 'MCO_terms_found_in_softfiles.tsv'), sep="\t")