Estefani Gaytan Nunez

upload

Showing 56 changed files with 2812 additions and 3 deletions
@@ -57,6 +57,12 @@ if (!length(opt)){
57 ## Input files and output directories
58 infoFile <- opt$infoFile
59
60 +if (!"gse" %in% names(gseInfo)){
61 + stop("include at least gse column")
62 +}
63 +if (!"gsm" %in% names(gseInfo)){
64 + gseInfor$gsm <- "GSM"
65 +}
66
67 ## Load main variables
68
@@ -89,4 +95,4 @@ for (geoid in unique(gseInfo$gse)) {
95 }
96 cat("download id: ", length(list.dirs(opt$downloadPath, recursive = FALSE)))
97
92 -message("Required GSE: ", ngse_down)
\ No newline at end of file
98 +message("Required GSE: ", ngse_down)
...
@@ -42,6 +42,11 @@ def get_crossref_info(info_df):
42 - **pmid**: PubMed ID
43 """
44 def get_cite_info(info_df):
45 +    if(info_df.CASE_MATCH == "ZIKA"):
46 +        cite_dict = {"evidence_id": "",
47 +                     "evidence_name": "ZIKA",
48 +                     "pmid": info_df.PMID}
49 +        return(cite_dict) #return here so the NPL-CRF dict below does not overwrite it
50 cite_dict ={
51 "evidence_id": "",
52 "evidence_name" : "NPL-CRF", #NPL
@@ -49,6 +54,8 @@ def get_cite_info(info_df):
54 }
55 return(cite_dict)
56 def get_description(info_df):
57 + if(info_df.CASE_MATCH=="ZIKA"):
58 + mco_mapping = {}
59 if info_df.CASE_MATCH == "MCO" and info_df.SET == 100 and info_df.SORT == 100:
60 mco_mapping = {
61 "type": "term present on MCO"
...
1 +from numpy import nan
2 +from collections import OrderedDict
3 +from pandas import DataFrame as DF
4 +"""
5 + - **name**: name of the term as registered in the MCO
6 + - **term_id**: RegulonDB identifier of the term (if it exists)
7 + - **term_type**: term type, one of: "Organism", "Genetic background", "Medium", "Medium supplement", "Aeration", "Temperature", "pH", "Pressure", "Optical Density (OD)", "Growth phase", "Growth rate", "Vessel type", "Agitation speed"
8 + - **source_data**: subdocument with the GEO information from which the GC information was extracted
9 + - **source**: data source [ GEO ]
10 + - **id**: record identifier in the database or data source
11 + - **field**: field from which the GC information is taken [ metadata field ]
12 + - **associatedPhrase**: phrase from which the information was taken
13 +"""
14 +def get_term_info(info_df, source):
15 + term_dict = {
16 + "name": info_df.TERM_NAME, #NPL output
17 + "term_id" : info_df.TERM_ID, #MCO
18 + "term_type": info_df.TERM_TYPE, #NPL
19 + "source_data": info_df.REPO_FILE, #NPL
20 + "source": source,
21 + "id": info_df.GSM, #NPL
22 + "field": info_df.BANGLINE, #NPL
23 + "associatedPhrase": info_df.FULL_TEXT #NPL
24 + }
25 + return(term_dict)
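
Note: get_term_info expects one row of the NPL-CRF output; a minimal usage sketch (not part of the commit; all values below are invented for illustration):

    from pandas import Series

    # hypothetical NPL-CRF row carrying the columns get_term_info reads
    row = Series({
        "TERM_NAME": "M9 minimal medium",
        "TERM_ID": "MCO000000012",      # assumed MCO id, illustration only
        "TERM_TYPE": "Medium",
        "REPO_FILE": "GSE54899/GSE54899.soft.gz",
        "GSM": "GSM1326335",
        "BANGLINE": "!Sample_growth_protocol_ch1",
        "FULL_TEXT": "cells were grown in M9 minimal medium",
    })
    get_term_info(row, source="GEO")
    # -> {'name': 'M9 minimal medium', 'term_id': 'MCO000000012', 'term_type': 'Medium', ...}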
26 +
27 +
28 +"""
29 + - **objectId**: identifier in the source database
30 + - **externalCrossReferences_name**: name of the DB [ GEO ]
31 +"""
32 +def get_crossref_info(info_df, source):
33 + crossref_dict ={
34 + "objectId": info_df.GSM, #NPL
35 + "externalCrossReferences_name" : source
36 + }
37 + return(crossref_dict)
38 +
39 +"""
40 + - **evidence_id**: RegulonDB identifier associated with the evidence
41 + - **evidence_name**: name of the evidence, when no identifier is available
42 + - **pmid**: PubMed ID
43 +"""
44 +def get_cite_info(info_df, esource):
45 + cite_dict ={
46 + "evidence_id": "",
47 + "evidence_name" : esource,
48 + "pmid": info_df.PMID
49 + }
50 + return(cite_dict)
51 +
52 +def get_description(info_df, no_map=False):
53 + if(no_map):
54 + mco_mapping = {
55 + "type": "not present on MCO"
56 + }
57 + elif info_df.CASE_MATCH == "MCO" and info_df.SET == 100 and info_df.SORT == 100:
58 + mco_mapping = {
59 + "type": "term present on MCO"
60 + }
61 + else:
62 + mco_mapping = {
63 + "type": "string similarity",
64 + "score": info_df.SET
65 + }
66 + return(mco_mapping)
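
Note: the three branches of get_description, exercised on hypothetical rows (a sketch; it assumes SET/SORT hold the fuzzywuzzy scores used elsewhere in this commit):

    from pandas import Series

    exact = Series({"CASE_MATCH": "MCO", "SET": 100, "SORT": 100})
    fuzzy = Series({"CASE_MATCH": "MCO", "SET": 87, "SORT": 90})
    get_description(exact)               # {'type': 'term present on MCO'}
    get_description(fuzzy)               # {'type': 'string similarity', 'score': 87}
    get_description(exact, no_map=True)  # {'type': 'not present on MCO'}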
67 +"""
68 +#run it in the main for each field
69 +
70 +
71 +returns: dict with the following fields
72 +
73 +id: string
74 +name: string
75 +description: string
76 +terms: list of dict
77 +externalCrossReferences: list of dict
78 +citations: list of dict
79 +
80 +"""
81 +
82 +def created_record(term_info_df, source = "GEO", no_map = False, esource = "NPL-CRF"):
83 + record_dict = OrderedDict()
84 + term_info_df = term_info_df.replace(nan, '', regex=True)
85 + record_dict["id"] = term_info_df.TERM_ID #it should be add if it have been mapped
86 + record_dict["name"] = term_info_df.TERM_NAME #a colum form NPL output
87 + record_dict["description"] = [get_description(term_info_df, no_map=no_map)]
88 + record_dict["terms"] = [get_term_info(term_info_df, source)]
89 + record_dict["externalCrossReferences"] = [get_crossref_info(term_info_df, source)]
90 + record_dict["citations"] = [get_cite_info(term_info_df, esource)]
91 +
92 + return(record_dict)
93 +
94 +def json2DataFrame(data):
95 + mco_syn_dic = dict()
96 +
97 + for j,i in enumerate(data):
98 + if "regulondb_id" in i.keys():
99 +
100 + if "synonyms" in i.keys():
101 +
102 + for k,syn in enumerate(i['synonyms']):
103 + dict_key = i['regulondb_id']+"_"+str(k)
104 + mco_syn_dic[dict_key] = {
105 + #ENTITY_NAME
106 + 'ENTITY_NAME' : i['name'],
107 + #ENTITY_SYN
108 + 'TERM_NAME': syn.lower(),
109 + #regulondb_id
110 + 'TERM_ID' : i['regulondb_id'] }
111 +
112 + elif "hasRelatedSynonyms" in i.keys():
113 +
114 + for k,syn in enumerate(i['hasRelatedSynonyms']):
115 + dict_key = i['regulondb_id']+"_"+str(k)
116 + mco_syn_dic[dict_key] = {
117 + #ENTITY_NAME
118 + 'ENTITY_NAME' : i['name'],
119 + #ENTITY_SYN
120 + 'TERM_NAME': syn.lower(),
121 + #regulondb_id
122 + 'TERM_ID' : i['regulondb_id'] }
123 + else:
124 + dict_key = i['regulondb_id']
125 + mco_syn_dic[dict_key] = {
126 + #ENTITY_NAME
127 + 'ENTITY_NAME' : i['name'],
128 + #ENTITY_SYN
129 + 'TERM_NAME': '',
130 + #regulondb_id
131 + 'TERM_ID' : i['regulondb_id'] }
132 +
133 + mco_syn_df = DF.from_dict(mco_syn_dic).T
134 +
135 +
136 + return(mco_syn_df)
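
Note: json2DataFrame expects the MCO synonyms JSON as a list of term entries; an illustrative input (ids invented), showing the row-per-synonym output:

    data = [
        {"regulondb_id": "RDBECOLIGC00001", "name": "aerobic", "synonyms": ["Aerobiosis"]},
        {"regulondb_id": "RDBECOLIGC00002", "name": "LB medium"},
    ]
    json2DataFrame(data)
    # index RDBECOLIGC00001_0: ENTITY_NAME='aerobic',   TERM_NAME='aerobiosis', TERM_ID='RDBECOLIGC00001'
    # index RDBECOLIGC00002:   ENTITY_NAME='LB medium', TERM_NAME='',           TERM_ID='RDBECOLIGC00002'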
137 +
1 +from numpy import nan
2 +#from collections import OrderedDict
3 +from pandas import DataFrame as DF
4 +import json
5 +from collections import defaultdict
6 +import format_fun_v6 as format_fun
7 +
8 +def to_json(df, source_info, evidence_source, ofname):
9 + df_terms = defaultdict(list)
10 +
11 + for idx,row in df.iterrows():
12 + term_record = format_fun.get_term_info(
13 + row,
14 + source = source_info,
15 + map= row.MAP)
16 + df_terms[row.SRR].append(term_record)
17 +
18 + df_json = {}
19 + df_tmp = df.drop_duplicates("SRR", keep="first")
20 + for idx,row in df_tmp.iterrows():
21 + srr_record = format_fun.created_record(
22 + info_df = row,
23 + term_list = df_terms[row.SRR],
24 + source = source_info,
25 + esource = evidence_source)
26 + df_json[row.SRR] = srr_record
27 +
28 + with open(ofname, "w") as output:
29 + json.dump(df_json, output, separators=(',', ':'), indent=4)
30 +
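
Note: the shape of the JSON that to_json writes, one record per SRR run (a sketch based on created_record below; the SRR id is invented):

    # {
    #   "SRR1234567": {
    #     "id": "", "name": "", "description": "",
    #     "terms": [ ...one dict per get_term_info(...) call... ],
    #     "externalCrossReferences": [ ... ],
    #     "citations": [ ... ]
    #   },
    #   ...
    # }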
31 +def get_score(info_df):
32 +    if info_df.CASE_MATCH == "MCO" and info_df.SET == 100 and info_df.SORT == 100:
33 +        mco_mapping = {"type": "term present on MCO"}
34 +    else:
35 +        mco_mapping = {
36 +            "type": "string similarity",
37 +            "score": info_df.SET
38 +        }
39 +    return(mco_mapping)
40 +"""
41 + - **name**: name of the term as registered in the MCO
42 + - **term_id**: RegulonDB identifier of the term (if it exists)
43 + - **term_type**: term type, one of: "Organism", "Genetic background", "Medium", "Medium supplement", "Aeration", "Temperature", "pH", "Pressure", "Optical Density (OD)", "Growth phase", "Growth rate", "Vessel type", "Agitation speed"
44 + - **source_data**: subdocument with the GEO information from which the GC information was extracted
45 + - **source**: data source [ GEO ]
46 + - **id**: record identifier in the database or data source
47 + - **field**: field from which the GC information is taken [ metadata field ]
48 + - **associatedPhrase**: phrase from which the information was taken
49 +"""
50 +def get_term_info(info_df, source, map=True):
51 + info_df = info_df.replace(nan, "", regex=True)
52 + term_dict = {
53 + "name": info_df.TERM_NAME, #NPL output
54 + "term_id" : info_df.TERM_ID, #MCO
55 + "term_type": info_df.TERM_TYPE, #NPL
56 + "source_data": {
57 + "source": source,
58 + "id": info_df.GSM, #NPL
59 + "field": info_df.BANGLINE, #NPL
60 + "associatedPhrase": info_df.FULL_TEXT, #NPL
61 + "description" : get_description(info_df, map),
62 + "similarity_percentage" : info_df.SET
63 + }
64 + }
65 + return(term_dict)
66 +
67 +
68 +"""
69 + - **objectId**: identifier in the source database
70 + - **externalCrossReferences_name**: name of the DB [ GEO ]
71 +"""
72 +def get_crossref_info(info_df, source):
73 + crossref_dict ={
74 + "objectId": info_df.GSM, #NPL
75 + "externalCrossReferences_name" : source
76 + }
77 + return(crossref_dict)
78 +
79 +"""
80 + - **evidence_id**: RegulonDB identifier associated with the evidence
81 + - **evidence_name**: name of the evidence, when no identifier is available
82 + - **pmid**: PubMed ID
83 +"""
84 +def get_cite_info(info_df, esource):
85 + cite_dict ={
86 + "evidence_id": "",
87 + "evidence_name" : esource,
88 + "pmid": info_df.PMID
89 + }
90 + return(cite_dict)
91 +
92 +def get_description(info_df, map=True):
93 + if not map:
94 + subtext = "absent in RegulonDB MCO"
95 + elif info_df.CASE_MATCH == "MCO" and info_df.SET == 100 and info_df.SORT == 100:
96 + subtext = "RegulonDB MCO term"
97 + else:
98 + subtext = "Similar term in RegulonDB MCO"
99 + return(subtext)
100 +"""
101 +#run it in the main for each field
102 +
103 +
104 +returns: dict with the following fields
105 +
106 +id: string
107 +name: string
108 +description: string
109 +terms: list of dict
110 +externalCrossReferences: list of dict
111 +citations: list of dict
112 +
113 +"""
114 +
115 +def created_record(info_df, term_list, source = "GEO", esource = "NPL-CRF"):
116 + #record_dict = OrderedDict()
117 + record_dict = {}
118 + info_df = info_df.replace(nan, "", regex=True)
119 + record_dict["id"] = ""
120 + record_dict["name"] = ""
121 + record_dict["description"] = ""
122 + record_dict["terms"] = term_list
123 + record_dict["externalCrossReferences"] = [get_crossref_info(info_df, source)]
124 + record_dict["citations"] = [get_cite_info(info_df, esource)]
125 +
126 + return(record_dict)
127 +
128 +def json2DataFrame(data):
129 + mco_syn_dic = dict()
130 +
131 + for j,i in enumerate(data):
132 + if "regulondb_id" in i.keys():
133 +
134 + if "synonyms" in i.keys():
135 +
136 + for k,syn in enumerate(i['synonyms']):
137 + dict_key = i['regulondb_id']+"_"+str(k)
138 + mco_syn_dic[dict_key] = {
139 + #ENTITY_NAME
140 + 'ENTITY_NAME' : i['name'],
141 + #ENTITY_SYN
142 + 'TERM_NAME': syn.lower(),
143 + #regulondb_id
144 + 'TERM_ID' : i['regulondb_id'] }
145 +
146 + elif "hasRelatedSynonyms" in i.keys():
147 +
148 + for k,syn in enumerate(i['hasRelatedSynonyms']):
149 + dict_key = i['regulondb_id']+"_"+str(k)
150 + mco_syn_dic[dict_key] = {
151 + #ENTITY_NAME
152 + 'ENTITY_NAME' : i['name'],
153 + #ENTITY_SYN
154 + 'TERM_NAME': syn.lower(),
155 + #regulondb_id
156 + 'TERM_ID' : i['regulondb_id'] }
157 + else:
158 + dict_key = i['regulondb_id']
159 + mco_syn_dic[dict_key] = {
160 + #ENTITY_NAME
161 + 'ENTITY_NAME' : i['name'],
162 + #ENTITY_SYN
163 + 'TERM_NAME': '',
164 + #regulondb_id
165 + 'TERM_ID' : i['regulondb_id'] }
166 +
167 + mco_syn_df = DF.from_dict(mco_syn_dic).T
168 +
169 +
170 + return(mco_syn_df)
171 +
1 +# -*- coding: utf-8 -*-
2 +"""
3 +#Setup
4 +"""
5 +
6 +#################### Setup ####################
7 +from collections import defaultdict
8 +from optparse import OptionParser
9 +import os
10 +from numpy.core.fromnumeric import sort
11 +from pandas import read_csv, DataFrame, merge, concat, read_table
12 +from numpy import exp, nan
13 +import seaborn as sns
14 +from numpy import mean
15 +
16 +import matplotlib.pyplot as plt
17 +import matplotlib
18 +matplotlib.style.use('ggplot')
19 +# %matplotlib inline
20 +
21 +from collections import Counter
22 +import json
23 +
24 +from fuzzywuzzy import fuzz
25 +from fuzzywuzzy import process
26 +
27 +import format_fun
28 +import mapping_fun
29 +import sys
30 +
31 +"""
32 +# input parameters
33 +--inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
34 +--iAnnotatedFile srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv
35 +--iOntoFile gc_ontology_terms_v2.txt
36 +--iLinksFile gc_ontology_terms_link_v2.txt
37 +--iSynFile mco_terms_v0.2.json
38 +--outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/
39 +--outputFile all_srr_IV_mapped.tsv
40 +--minPerMatch 90
41 +
42 +
43 +#Example
44 +# nohup python3 mapping2MCO_v3.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv --iOntoFile gc_ontology_terms_v2.txt --iSynFile mco_terms_v0.2.json --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ --outputFile srr_htregulondb_mapped.tsv --minPerMatch 80 --minCRFProbs 0.9 > ../reports/srr_htregulondb_mapping_report.out &
45 +"""
46 +#################### Defining parameters ####################
47 +if __name__ == "__main__":
48 + parser = OptionParser()
49 + parser.add_option(
50 + "--inputPath",
51 + dest="input_path",
52 + help="Path of npl tagged file (crf output)",
53 + metavar="PATH")
54 + parser.add_option(
55 + "--iAnnotatedFile",
56 + dest="npl_fname",
57 + help="Input file of npl tagged file (crf output)",
58 + metavar="FILE",
59 + default="")
60 + parser.add_option(
61 + "--iOntoFile",
62 + dest="onto_fname",
63 + help="Input file with the ontology entities",
64 + metavar="FILE",
65 + default="")
66 + parser.add_option(
67 + "--iLinksFile",
68 + dest="links_fname",
69 + help="Input file with links and id for the ontology",
70 + metavar="FILE",
71 + default=None)
72 + parser.add_option(
73 + "--iSynFile",
74 + dest="syn_fname",
75 + help="Input file for the additional ontology of synonyms",
76 + metavar="FILE",
77 + default=None)
78 + parser.add_option(
79 + "--outputPath",
80 + dest="output_path",
81 + help="Output path to place output files",
82 + metavar="PATH")
83 + parser.add_option(
84 + "--outputFile",
85 + dest="out_fname",
86 + help="Output file name for the mapping process",
87 + metavar="FILE",
88 + default="")
89 + parser.add_option(
90 + "--minPerMatch",
91 + dest="min_score",
92 + help="Minimal string matching percentage")
93 + parser.add_option(
94 + "--minCRFProbs",
95 + dest="min_probs",
96 + help="Minimal crf probabilities")
97 +
98 + (options, args) = parser.parse_args()
99 + if len(args) > 0:
100 + parser.error("Unexpected positional arguments given.")
101 + sys.exit(1)
102 +
103 + #################### DISP PARAMETERS ####################
104 + print('\n\n-------------------------------- PARAMETERS --------------------------------\n')
105 + print("--inputPath Path of npl tagged file: " + str(options.input_path))
106 + print("--iAnnotatedFile Input file of npl tagged file: " + str(options.npl_fname))
107 + print("--iOntoFile Input file with the ontology entities (MCO-terms): " + str(options.onto_fname))
108 + print("--iLinksFile Input file with links and id for the ontology (MCO-type-links): " + str(options.links_fname))
109 + print("--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): " + str(options.syn_fname))
110 + print("--outputPath Output path to place output files: " + str(options.output_path))
111 + print("--outputFile Output of the mapping process: " + str(options.out_fname))
112 + print("--minPerMatch Minimal string matching percentage: " + str(options.min_score))
113 + print("--minCRFProbs Minimal crf probabilities allowed: " + str(options.min_probs))
114 +
115 + print("\n\n")
116 + repognrl = "http://pakal.ccg.unam.mx/cmendezc"
117 + reponame = "automatic-extraction-growth-conditions/tree/master/extraction-geo/download/srr_htregulondb"
118 + repo_url = '/'.join([repognrl,reponame])
119 +
120 + # Input files
121 + min_score = int(options.min_score)
122 + min_probs = float(options.min_probs)
123 + npl_ifile = os.path.join(options.input_path, options.npl_fname)
124 + mco_ifile = os.path.join(options.input_path, options.onto_fname)
125 + mco_syn_ifile = os.path.join(options.input_path, options.syn_fname)
126 +
127 + #Output files
128 + raw_ofname = "_".join(["raw", options.out_fname])
129 + rawmap_ofile = os.path.join(options.output_path, raw_ofname)
130 + str_ofname = "_".join(["sim", options.out_fname])
131 + strmap_ofile = os.path.join(options.output_path, str_ofname)
132 +
133 + full_ofile = os.path.join(options.output_path, "full_"+options.out_fname)
134 + full_unmap_ofile = os.path.join(options.output_path, "full_unmap_"+options.out_fname)
135 +
136 + json_ofile = os.path.join(options.output_path, options.out_fname)
137 + json_ofile_map = json_ofile.replace(".tsv", "_map.json")
138 + json_ofile_unmap= json_ofile.replace(".tsv", "_unmap.json")
139 + json_ofile_list= json_ofile.replace(".tsv", "_list.json")
140 + json_ofile_df_list= json_ofile.replace(".tsv", "_df_list.json")
141 +
142 + #################### Load input data ####################
143 + # Load CRF-annotation
144 + exp_cols = {"SRR", "GSE", "GSM", "GPL", "PMID", "FULL_TEXT", "BANGLINE", "TERM_NAME", "TERM_TYPE", "PROB"}
145 + npl_full = read_table(npl_ifile, sep = "\t")
146 +
147 + obs_cols = set(npl_full.columns)
148 +
149 + if exp_cols.intersection(obs_cols) != exp_cols:
150 + ocol = ", ".join(list(exp_cols))
151 + sys.exit("iAnnotatedFile is expected to contain the columns: " + ocol)
152 +
153 + #Load MCO term names
154 + exp_cols = {"TERM_ID", "TERM_NAME"}
155 + mco_df_full = read_table(mco_ifile, sep = "\t")
156 + obs_cols = set(mco_df_full.columns)
157 +
158 + if exp_cols.intersection(obs_cols) != exp_cols:
159 + sys.exit("iOntoFile is expected to contain the columns \"TERM_ID\" and \"TERM_NAME\"")
160 +
161 + mco_df = mco_df_full[["TERM_ID","TERM_NAME"]]
162 + mco_df = mco_df.drop_duplicates(keep="first")
163 + mco_df = mco_df.dropna()
164 +
165 + #Load MCO links
166 + if options.links_fname is not None:
167 + print("\nLoad types...")
168 + mcolink_ifile = os.path.join(options.input_path, options.links_fname)
169 + exp_cols = {"TERM_ID", "TERM_TYPE"}
170 + mco_links_full = read_table(mcolink_ifile, sep = "\t")
171 +
172 + obs_cols = set(mco_links_full.columns)
173 +
174 + if exp_cols.intersection(obs_cols) != exp_cols:
175 + sys.exit("at least \"TERM_ID\" and \"TERM_TYPE\" expected columns for iLinksFile" )
176 +
177 + mco_links = mco_links_full[["TERM_ID", "TERM_TYPE"]]
178 + mco_links = mco_links.drop_duplicates(keep="first")
179 + mco_links = mco_links.dropna()
180 + else:
181 + mco_links = None
182 +
183 + #Load MCO terms synonyms
184 + #format json from mco to dataframe
185 + mco_json = open(mco_syn_ifile )
186 + data = json.load(mco_json)
187 + mco_syn = format_fun.json2DataFrame(data)
188 +
189 + df_json = defaultdict(list)
190 +
191 + for idx,row in npl_full.iterrows():
192 + record = format_fun.created_record(row)
193 + df_json[row.SRR].append(record)
194 +
195 + #dump the grouped records per SRR (records were already formatted above)
196 + with open(json_ofile_list, "w") as output:
197 + json.dump(df_json, output)
198 +
199 + with open(json_ofile_df_list, "a") as output:
200 + for srr, records in df_json.items():
201 + json.dump(records, output)
\ No newline at end of file
1 +# -*- coding: utf-8 -*-
2 +"""
3 +#Setup
4 +"""
5 +
6 +#################### Setup ####################
7 +from collections import defaultdict
8 +from optparse import OptionParser
9 +import os
10 +from pandas import read_csv, DataFrame, merge, concat, read_table
11 +from numpy import exp, nan, mean
12 +import json
13 +import format_fun_v4 as format_fun
14 +import sys
15 +
16 +"""
17 +# input parameters
18 +--inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
19 +--iAnnotatedFile srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv
20 +--outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/
21 +--outputFile all_srr_IV_mapped.tsv
22 +
23 +
24 +#Example
25 +# python3 /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/format_zika_v4.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile No_GSM_Metadata_Selected_v3.tsv --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ --outputFile zika_mapped.json
26 +"""
27 +#################### Defining parameters ####################
28 +if __name__ == "__main__":
29 + parser = OptionParser()
30 + parser.add_option(
31 + "--inputPath",
32 + dest="input_path",
33 + help="Path of npl tagged file (crf output)",
34 + metavar="PATH")
35 + parser.add_option(
36 + "--iAnnotatedFile",
37 + dest="npl_fname",
38 + help="Input file of npl tagged file (crf output)",
39 + metavar="FILE",
40 + default="")
41 + parser.add_option(
42 + "--outputPath",
43 + dest="output_path",
44 + help="Output path to place output files",
45 + metavar="PATH")
46 + parser.add_option(
47 + "--outputFile",
48 + dest="out_fname",
49 + help="Output file name for the mapping process",
50 + metavar="FILE",
51 + default="")
52 +
53 + (options, args) = parser.parse_args()
54 + if len(args) > 0:
55 + parser.error("Unexpected positional arguments given.")
56 + sys.exit(1)
57 +
58 + #################### DISP PARAMETERS ####################
59 + print('\n\n-------------------------------- PARAMETERS --------------------------------\n')
60 + print("--inputPath Path of npl tagged file: " + str(options.input_path))
61 + print("--iAnnotatedFile Input file of npl tagged file: " + str(options.npl_fname))
62 + print("--outputPath Output path to place output files: " + str(options.output_path))
63 + print("--outputFile Output of the mapping process: " + str(options.out_fname))
64 +
65 + print("\n\n")
66 +
67 + # Input files
68 + npl_ifile = os.path.join(options.input_path, options.npl_fname)
69 +
70 + #Output files
71 + ofname = os.path.join(options.output_path, options.out_fname)
72 +
73 + #################### Load input data ####################
74 + # Load CRF-annotation
75 + exp_cols = {"SRR", "GSE", "GSM", "GPL", "PMID", "FULL_TEXT", "BANGLINE", "TERM_NAME", "TERM_TYPE", "PROB"}
76 + npl_full = read_table(npl_ifile, sep = "\t")
77 +
78 + obs_cols = set(npl_full.columns)
79 +
80 + if exp_cols.intersection(obs_cols) != exp_cols:
81 + ocol = ", ".join(list(exp_cols))
82 + sys.exit("iAnnotatedFile is expected to contain the columns: " + ocol)
83 +
84 + df_json = defaultdict(list)
85 +
86 + for idx,row in npl_full.iterrows():
87 + record = format_fun.created_record(row, source = "ZIKAdb", no_map = True, esource = "database")
88 + if(idx<2): print(record)
89 + #record_json = json.dumps(record)
90 + record_json = record
91 + df_json[row.SRR].append(record_json)
92 +
93 + """
94 + with open(ofname, "a") as output:
95 + output.write("field:[")
96 + sep=""
97 + for k,v in df_json.items():
98 + output.write(sep)
99 + json.dump(v, output)
100 + sep=","
101 + output.write("]")
102 +
103 + """
104 + with open(ofname, "a") as output:
105 + output.write("{")
106 + sep=""
107 + for k,v in df_json.items():
108 + output.write(sep)
109 + output.write("\""+k+"\"")
110 + output.write(":")
111 + record_list = {
112 + "growth_conditions": df_json[k]
113 + }
114 + json.dump(record_list, output)
115 + sep=","
116 + output.write("}")
117 +
118 + df=open(ofname) #reload the output to check it parses as valid JSON
119 + df=json.load(df)
120 +
1 +# -*- coding: utf-8 -*-
2 +"""
3 +#Setup
4 +"""
5 +
6 +#################### Setup ####################
7 +from optparse import OptionParser
8 +import os
9 +from pandas import read_csv, DataFrame, merge, concat, read_table
10 +from numpy import mean
11 +import format_fun_v6 as format_fun
12 +import sys
13 +
14 +"""
15 +# input parameters
16 +--inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
17 +--iAnnotatedFile srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv
18 +--outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/
19 +--outputFile all_srr_IV_mapped.tsv
20 +
21 +
22 +#Example
23 +# python3 /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/format_zika_v5.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile No_GSM_Metadata_Selected_v4.tsv --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v2/ --outputFile zika.json > automatic-extraction-growth-conditions/mapping_MCO/reports/zika_formated_report.out
24 +# python3 /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/format_zika_v5.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile No_GSM_Metadata_Selected_v4.tsv --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/test/ --outputFile zika_mapped.json > automatic-extraction-growth-conditions/mapping_MCO/test/zika_mapping_report.out
25 +
26 +"""
27 +#################### Defining parameters ####################
28 +if __name__ == "__main__":
29 + parser = OptionParser()
30 + parser.add_option(
31 + "--inputPath",
32 + dest="input_path",
33 + help="Path of npl tagged file (crf output)",
34 + metavar="PATH")
35 + parser.add_option(
36 + "--iAnnotatedFile",
37 + dest="npl_fname",
38 + help="Input file of npl tagged file (crf output)",
39 + metavar="FILE",
40 + default="")
41 + parser.add_option(
42 + "--outputPath",
43 + dest="output_path",
44 + help="Output path to place output files",
45 + metavar="PATH")
46 + parser.add_option(
47 + "--outputFile",
48 + dest="out_fname",
49 + help="Output file name for the mapping process",
50 + metavar="FILE",
51 + default="")
52 +
53 + (options, args) = parser.parse_args()
54 + if len(args) > 0:
55 + parser.error("Unexpected positional arguments given.")
56 + sys.exit(1)
57 +
58 + #################### DISP PARAMETERS ####################
59 + print('\n\n-------------------------------- PARAMETERS --------------------------------\n')
60 + print("--inputPath Path of npl tagged file: " + str(options.input_path))
61 + print("--iAnnotatedFile Input file of npl tagged file: " + str(options.npl_fname))
62 + print("--outputPath Output path to place output files: " + str(options.output_path))
63 + print("--outputFile Output of the mapping process: " + str(options.out_fname))
64 +
65 + print("\n\n")
66 +
67 + # Input files
68 + npl_ifile = os.path.join(options.input_path, options.npl_fname)
69 +
70 + #Output files
71 + ofname = os.path.join(options.output_path, options.out_fname)
72 +
73 + #################### Load input data ####################
74 + # Load CRF-annotation
75 + exp_cols = {"SRR", "GSE", "GSM", "GPL", "PMID", "FULL_TEXT", "BANGLINE", "TERM_NAME", "TERM_TYPE", "PROB"}
76 + npl_full = read_table(npl_ifile, sep = "\t")
77 + npl_full = npl_full.drop_duplicates()
78 +
79 + print(f"Total zika terms: {len(npl_full)} ")
80 + obs_cols = set(npl_full.columns)
81 +
82 + if exp_cols.intersection(obs_cols) != exp_cols:
83 + ocol = ", ".join(list(exp_cols))
84 + sys.exit("iAnnotatedFile is expected to contain the columns: " + ocol)
85 + """
86 + df_terms = defaultdict(list)
87 +
88 + for idx,row in npl_full.iterrows():
89 + term_record = format_fun.get_term_info(row, source = "ZIKAdb", map=False)
90 + df_terms[row.SRR].append(term_record)
91 +
92 + df_json = {}
93 + df_tmp = npl_full.drop_duplicates("SRR", keep="first")
94 + for idx,row in df_tmp.iterrows():
95 + srr_record = format_fun.created_record(
96 + info_df = row,
97 + term_list = df_terms[row.SRR],
98 + source = "ZIKAdb",
99 + esource = "database")
100 + df_json[row.SRR] = srr_record
101 +
102 + with open(ofname, "w") as output:
103 + json.dump(df_json, output, separators=(',', ':'), indent=4)
104 +
105 + df=open(ofname)
106 + df=json.load(df)
107 + print(df["ERR1399578"])
108 + """
109 + npl_full["MAP"] = False
110 + format_fun.to_json(
111 + df = npl_full,
112 + source_info = "ZIKAdb",
113 + evidence_source = "database",
114 + ofname = ofname)
\ No newline at end of file
@@ -23,7 +23,8 @@ import json
23 from fuzzywuzzy import fuzz
24 from fuzzywuzzy import process
25
26 -import format_fun
26 +#import format_fun
27 +import format_fun_v4 as format_fun
28 import mapping_fun
29 import sys
30
@@ -338,5 +339,6 @@ if __name__ == "__main__":
339 with open(json_ofile_unmap, "a") as output:
340 for idx,row in full_unmap.iterrows():
341 json.dump(format_fun.created_record(row), output)
342 +
343 +
344
342 -
\ No newline at end of file
...
1 +# -*- coding: utf-8 -*-
2 +"""
3 +#Setup
4 +"""
5 +
6 +#################### Setup ####################
7 +from optparse import OptionParser
8 +import os
9 +from numpy.core.fromnumeric import sort
10 +from pandas import read_csv, DataFrame, merge, concat, read_table
11 +from numpy import exp, nan
12 +import seaborn as sns
13 +from numpy import mean
14 +
15 +import matplotlib.pyplot as plt
16 +import matplotlib
17 +matplotlib.style.use('ggplot')
18 +# %matplotlib inline
19 +
20 +from collections import Counter, defaultdict
21 +import json
22 +
23 +from fuzzywuzzy import fuzz
24 +from fuzzywuzzy import process
25 +
26 +#import format_fun
27 +import format_fun_v4 as format_fun
28 +import mapping_fun
29 +import sys
30 +
31 +"""
32 +# input parameters
33 +--inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
34 +--iAnnotatedFile srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv
35 +--iOntoFile gc_ontology_terms_v2.txt
36 +--iLinksFile gc_ontology_terms_link_v2.txt
37 +--iSynFile mco_terms_v0.2.json
38 +--outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/
39 +--outputFile all_srr_IV_mapped.tsv
40 +--minPerMatch 90
41 +
42 +
43 +#Example
44 +# nohup python3 mapping2MCO_v3.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv --iOntoFile gc_ontology_terms_v2.txt --iSynFile mco_terms_v0.2.json --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ --outputFile srr_htregulondb_mapped.tsv --minPerMatch 80 --minCRFProbs 0.9 > ../reports/srr_htregulondb_mapping_report.out &
45 +"""
46 +#################### Defining parameters ####################
47 +if __name__ == "__main__":
48 + parser = OptionParser()
49 + parser.add_option(
50 + "--inputPath",
51 + dest="input_path",
52 + help="Path of npl tagged file (crf output)",
53 + metavar="PATH")
54 + parser.add_option(
55 + "--iAnnotatedFile",
56 + dest="npl_fname",
57 + help="Input file of npl tagged file (crf output)",
58 + metavar="FILE",
59 + default="")
60 + parser.add_option(
61 + "--iOntoFile",
62 + dest="onto_fname",
63 + help="Input file with the ontology entities",
64 + metavar="FILE",
65 + default="")
66 + parser.add_option(
67 + "--iLinksFile",
68 + dest="links_fname",
69 + help="Input file with links and id for the ontology",
70 + metavar="FILE",
71 + default=None)
72 + parser.add_option(
73 + "--iSynFile",
74 + dest="syn_fname",
75 + help="Input file for the additional ontology of synonyms",
76 + metavar="FILE",
77 + default=None)
78 + parser.add_option(
79 + "--outputPath",
80 + dest="output_path",
81 + help="Output path to place output files",
82 + metavar="PATH")
83 + parser.add_option(
84 + "--outputFile",
85 + dest="out_fname",
86 + help="Output file name for the mapping process",
87 + metavar="FILE",
88 + default="")
89 + parser.add_option(
90 + "--minPerMatch",
91 + dest="min_score",
92 + help="Minimal string matching percentage")
93 + parser.add_option(
94 + "--minCRFProbs",
95 + dest="min_probs",
96 + help="Minimal crf probabilities")
97 +
98 + (options, args) = parser.parse_args()
99 + if len(args) > 0:
100 + parser.error("Unexpected positional arguments given.")
101 + sys.exit(1)
102 +
103 + #################### DISP PARAMETERS ####################
104 + print('\n\n-------------------------------- PARAMETERS --------------------------------\n')
105 + print("--inputPath Path of npl tagged file: " + str(options.input_path))
106 + print("--iAnnotatedFile Input file of npl tagged file: " + str(options.npl_fname))
107 + print("--iOntoFile Input file with the ontology entities (MCO-terms): " + str(options.onto_fname))
108 + print("--iLinksFile Input file with links and id for the ontology (MCO-type-links): " + str(options.links_fname))
109 + print("--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): " + str(options.syn_fname))
110 + print("--outputPath Output path to place output files: " + str(options.output_path))
111 + print("--outputFile Output of the mapping process: " + str(options.out_fname))
112 + print("--minPerMatch Minimal string matching percentage: " + str(options.min_score))
113 + print("--minCRFProbs Minimal crf probabilities allowed: " + str(options.min_probs))
114 +
115 + print("\n\n")
116 + repognrl = "http://pakal.ccg.unam.mx/cmendezc"
117 + reponame = "automatic-extraction-growth-conditions/tree/master/extraction-geo/download/srr_htregulondb"
118 + repo_url = '/'.join([repognrl,reponame])
119 +
120 + # Input files
121 + min_score = int(options.min_score)
122 + min_probs = float(options.min_probs)
123 + npl_ifile = os.path.join(options.input_path, options.npl_fname)
124 + mco_ifile = os.path.join(options.input_path, options.onto_fname)
125 + mco_syn_ifile = os.path.join(options.input_path, options.syn_fname)
126 +
127 + #Output files
128 + raw_ofname = "_".join(["raw", options.out_fname])
129 + rawmap_ofile = os.path.join(options.output_path, raw_ofname)
130 + str_ofname = "_".join(["sim", options.out_fname])
131 + strmap_ofile = os.path.join(options.output_path, str_ofname)
132 +
133 + full_ofile = os.path.join(options.output_path, "full_"+options.out_fname)
134 + full_unmap_ofile = os.path.join(options.output_path, "full_unmap_"+options.out_fname)
135 +
136 + json_ofile = os.path.join(options.output_path, options.out_fname)
137 + json_ofile_map = json_ofile.replace(".tsv", "_map.json")
138 + json_ofile_unmap= json_ofile.replace(".tsv", "_unmap.json")
139 +
140 + #################### Load input data ####################
141 + # Load CRF-annotation
142 + exp_cols = {"SRR", "GSE", "GSM", "GPL", "PMID", "FULL_TEXT", "BANGLINE", "TERM_NAME", "TERM_TYPE", "PROB"}
143 + npl_full = read_table(npl_ifile, sep = "\t")
144 +
145 + obs_cols = set(npl_full.columns)
146 +
147 + if exp_cols.intersection(obs_cols) != exp_cols:
148 + ocol = ", ".join(list(exp_cols))
149 + sys.exit("iAnnotatedFile is expected to contain the columns: " + ocol)
150 +
151 + npl_df = npl_full[npl_full.PROB >= min_probs]
152 + npl_df = npl_df.drop_duplicates(keep="first")
153 + npl_df = npl_df.dropna()
154 +
155 +
156 + #Cleaning input
157 + npl_df['TERM_TYPE'] = [mapping_fun.transterm_npl2mco(term) for term in npl_df.TERM_TYPE]
158 + #filter non-MCO term types
159 + npl_df = npl_df[npl_df.TERM_TYPE != "exTag Type"]
160 +
161 +
162 + #add REPO_FILE source: access to the stored files at GitLab
163 + source_access = ['/'.join([repo_url,gse,gse+'.soft.gz']) for gse in npl_df['GSE']]
164 + npl_df['REPO_FILE'] = source_access
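
Note: for a row with GSE "GSE54899" (an invented accession), the REPO_FILE value built above resolves to:

    # http://pakal.ccg.unam.mx/cmendezc/automatic-extraction-growth-conditions/tree/master/extraction-geo/download/srr_htregulondb/GSE54899/GSE54899.soft.gz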
165 +
166 + ##remove additional spaces
167 + npl_df['TERM_NAME'] = [txt.strip() for txt in npl_df['TERM_NAME']]
168 + npl_df['PMID'] = [pmid.replace("PMID_", "") for pmid in npl_df['PMID']]
169 +
170 +
171 + #Load MCO term names
172 + exp_cols = {"TERM_ID", "TERM_NAME"}
173 + mco_df_full = read_table(mco_ifile, sep = "\t")
174 + obs_cols = set(mco_df_full.columns)
175 +
176 + if exp_cols.intersection(obs_cols) != exp_cols:
177 + sys.exit("iOntoFile is expected to contain the columns \"TERM_ID\" and \"TERM_NAME\"")
178 +
179 + mco_df = mco_df_full[["TERM_ID","TERM_NAME"]]
180 + mco_df = mco_df.drop_duplicates(keep="first")
181 + mco_df = mco_df.dropna()
182 +
183 + #Load MCO links
184 + if options.links_fname is not None:
185 + print("\nLoad types...")
186 + mcolink_ifile = os.path.join(options.input_path, options.links_fname)
187 + exp_cols = {"TERM_ID", "TERM_TYPE"}
188 + mco_links_full = read_table(mcolink_ifile, sep = "\t")
189 +
190 + obs_cols = set(mco_links_full.columns)
191 +
192 + if exp_cols.intersection(obs_cols) != exp_cols:
193 + sys.exit("at least \"TERM_ID\" and \"TERM_TYPE\" expected columns for iLinksFile" )
194 +
195 + mco_links = mco_links_full[["TERM_ID", "TERM_TYPE"]]
196 + mco_links = mco_links.drop_duplicates(keep="first")
197 + mco_links = mco_links.dropna()
198 + else:
199 + mco_links = None
200 +
201 + #Load MCO terms synonyms
202 + #format json from mco to dataframe
203 + mco_json = open(mco_syn_ifile )
204 + data = json.load(mco_json)
205 + mco_syn = format_fun.json2DataFrame(data)
206 +
207 +
208 + print('\n\n-------------------------------- INPUTS --------------------------------\n')
209 +
210 +
211 + print("\nnpl tagged file\n")
212 + print(npl_df.head(3))
213 + print("\nontology entities\n")
214 + print(mco_df.head(3))
215 + if options.links_fname is not None:
216 + print("\nlinks and id for the ontology (MCO-type-links)\n")
217 + print(mco_links.head(3))
218 + print("\nadditional ontology of synonyms (MCO-syn-json)\n")
219 + print(mco_syn.head(3))
220 +
221 +
222 + print('\n\n-------------------------------- RESULTS --------------------------------\n')
223 +
224 + #################### mapping to MCO exact string ####################
225 + #npl_df = npl_df.drop_duplicates("TERM_NAME", keep="first")
226 + #npl_df = npl_df.head(10)
227 +
228 + print(f"\nMapping {len(npl_df.index)} terms to MCO based on exact strings...\n")
229 +
230 + #first mapping
231 + raw_matches = mapping_fun.raw_map_mco(
232 + npl_df = npl_df,
233 + mco_df = mco_df,
234 + mco_links = mco_links,
235 + unmap = True)
236 +
237 + #save file name source of the raw mapping
238 + raw_matches["SOURCE"] = mco_ifile
239 + #additional column to merge
240 + raw_matches["ENTITY_NAME"] = ""
241 +
242 + #################### mapping to MCO.syn exact string ####################
243 +
244 + #define unmapped
245 + raw_mco_unmap = raw_matches[raw_matches.isna().TERM_ID]
246 + #input for the second step
247 + raw_mco_unmap = raw_mco_unmap[list(npl_df.columns)]
248 +
249 + print(f"\nMapping {len(raw_mco_unmap.index)} terms to MCO - synonyms based on exact strings...\n")
250 +
251 + #second mapping
252 + raw_matches_syn = mapping_fun.raw_map_mco(
253 + npl_df = raw_mco_unmap,
254 + mco_df = mco_syn,
255 + unmap = True)
256 +
257 + #additional column to merge
258 + raw_matches_syn["SOURCE"] = mco_syn_ifile
259 + #raw_matches_syn["TERM_TYPE"] = ""
260 +
261 + #################### save mapped terms based on exact strings ####################
262 +
263 + #all mapped
264 + raw_map_odf = concat([raw_matches, raw_matches_syn], sort=True).dropna()
265 +
266 + print(raw_map_odf.head(3))
267 + print(f"Total of terms mapped by exact strings: {len(raw_map_odf.index)}")
268 + print("Saving filtered terms from raw mapping...\n\n")
269 +
270 + raw_map_odf.to_csv(rawmap_ofile, sep = "\t", header =True, index=False)
271 +
272 + #################### unmapped raw terms ####################
273 + raw_mco_syn_unmap = raw_matches_syn[raw_matches_syn.isna().TERM_ID]
274 + raw_mco_syn_unmap = raw_mco_syn_unmap[list(npl_df.columns)]
275 +
276 + print(f"\n{len(raw_mco_syn_unmap.index)} unmapped terms based on exact strings")
277 + print("Dropping duplicated unmapped term names...\n")
278 + raw_mco_syn_unmap = raw_mco_syn_unmap.drop_duplicates("TERM_NAME")
279 +
280 + print(f"{len(raw_mco_syn_unmap.index)} unmapped unique terms based on exact strings\n")
281 +
282 + #################### string similarity mapping ####################
283 + ###Matching unmapped term names
284 + print(f"\nMapping to MCO {len(raw_mco_syn_unmap.index)} terms based on string similarity...\n")
285 +
286 + str_matches = mapping_fun.str_match_map_mco(raw_mco_syn_unmap, mco_df, mco_links = mco_links, min_match=0, npl_merges=False)
287 + str_matches_odf = str_matches[str_matches.SET >= min_score]
288 + str_matches_odf["SOURCE"] = mco_ifile
289 +
290 + #################### unmapped sim terms (MCO) ####################
291 + str_mco_unmap = str_matches[str_matches.SET < min_score]
292 + #str_mco_unmap = str_mco_unmap[list(npl_df.columns)]
293 + str_mco_unmap = str_mco_unmap.drop_duplicates("TERM_NAME")
294 +
295 + print(f"\nMapping to MCO - synonyms {len(str_mco_unmap.index)} terms based on string siilarity..\n")
296 + str_matches_syn = mapping_fun.str_match_map_mco(str_mco_unmap, mco_syn, min_match=min_score, npl_merges=False)
297 + str_matches_syn_odf = str_matches_syn[str_matches_syn.SET >= min_score]
298 + str_matches_syn_odf["SOURCE"] = mco_syn_ifile
299 +
300 + #################### save str-sim map terms ####################
301 + all_str_matches_odf = concat([str_matches_odf, str_matches_syn_odf], sort = True).dropna()
302 +
303 + print(f"Unique terms mapped by string similarity: {len(all_str_matches_odf.index)}")
304 +
305 + all_str_matches_npl_odf = merge(npl_df, all_str_matches_odf, on = ["TERM_NAME"], how="inner")
306 +
307 + print(f"Total of terms mapped by string similarity: {len(all_str_matches_npl_odf.index)}")
308 + print("Saving filtered terms from str mapping...")
309 +
310 + all_str_matches_npl_odf.to_csv(strmap_ofile, sep = "\t", header =True, index=False)
311 +
312 + #################### Formatting json ####################
313 + raw_map_odf["CASE_MATCH"] = "MCO"
314 + raw_map_odf["SET"] = 100
315 + raw_map_odf["SORT"] = 100
316 +
317 + full_map = concat([all_str_matches_npl_odf, raw_map_odf], sort = True)
318 +
319 + full_map.to_csv(full_ofile, sep = "\t", header =True, index=False)
320 +
321 + print(f"Total of terms mapped: {len(full_map.index)}")
322 +
323 + df_json = defaultdict(list)
324 +
325 + for idx,row in full_map.iterrows():
326 + record = format_fun.created_record(row)
327 + record_json = record
328 + df_json[row.SRR].append(record_json)
329 + if(idx <2):
330 + print(record_json)
331 +
332 + with open(json_ofile_map, "a") as output:
333 + output.write("{")
334 + sep=""
335 + for k,v in df_json.items():
336 + if v!={}:
337 + output.write(sep)
338 + output.write("\""+k+"\"")
339 + output.write(":")
340 + record_list = {
341 + "growth_conditions": v
342 + }
343 + json.dump(record_list, output)
344 + sep=","
345 + output.write("}")
346 +
347 + df=open(json_ofile_map) #reload the output to check it parses as valid JSON
348 + df=json.load(df)
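
Note: the manual "{" ... "}" writing above can be collapsed into a single dump of the same structure (an equivalent sketch, not a change to the commit):

    payload = {k: {"growth_conditions": v} for k, v in df_json.items() if v}
    with open(json_ofile_map, "w") as output:
        json.dump(payload, output)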
349 +
350 +
351 + full_unmap = merge(npl_df, full_map[["TERM_NAME", "TERM_ID"]], on = ["TERM_NAME"], how='left')
352 + full_unmap = full_unmap[full_unmap.isna().TERM_ID]
353 + print(full_unmap.head(3))
354 +
355 + print(f"Total of terms unmapped: {len(full_unmap.index)}")
356 +
357 + full_unmap["SOURCE"] = ""
358 + full_unmap["CASE_MATCH"] = ""
359 + full_unmap["SET"] = 0
360 + full_unmap["SORT"] = 0
361 +
362 + full_unmap.to_csv(full_unmap_ofile, sep = "\t", header =True, index=False)
363 +
364 + df_json = defaultdict(list)
365 +
366 + for idx,row in full_unmap.iterrows():
367 + record = format_fun.created_record(row, no_map=True)
368 + #record_json = json.dumps(record)
369 + record_json = record
370 + df_json[row.SRR].append(record_json)
371 + if(idx <2):
372 + print(record_json)
373 +
374 + with open(json_ofile_unmap, "a") as output:
375 + output.write("{")
376 + sep=""
377 + for k,v in df_json.items():
378 + output.write(sep)
379 + output.write("\""+k+"\"")
380 + output.write(":")
381 + record_list = {
382 + "growth_conditions": df_json[k]
383 + }
384 + json.dump(record_list, output)
385 + sep=","
386 + output.write("}")
387 +
388 + df=open(json_ofile_unmap) #reload the output to check it parses as valid JSON
389 + df=json.load(df)
390 +
391 +
392 +
1 +# -*- coding: utf-8 -*-
2 +"""
3 +#Setup
4 +"""
5 +
6 +#################### Setup ####################
7 +from optparse import OptionParser
8 +import os
9 +from numpy.core.fromnumeric import sort
10 +from pandas import read_csv, DataFrame, merge, concat, read_table
11 +from numpy import exp, nan
12 +import seaborn as sns
13 +from numpy import mean
14 +
15 +import matplotlib.pyplot as plt
16 +import matplotlib
17 +matplotlib.style.use('ggplot')
18 +# %matplotlib inline
19 +
20 +from collections import Counter, defaultdict
21 +import json
22 +
23 +from fuzzywuzzy import fuzz
24 +from fuzzywuzzy import process
25 +
26 +#import format_fun
27 +import format_fun_v6 as format_fun
28 +import mapping_fun
29 +import sys
30 +
31 +"""
32 +# input parameters
33 +--inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
34 +--iAnnotatedFile srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv
35 +--iOntoFile gc_ontology_terms_v2.txt
36 +--iLinksFile gc_ontology_terms_link_v2.txt
37 +--iSynFile mco_terms_v0.2.json
38 +--outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/
39 +--outputFile all_srr_IV_mapped.tsv
40 +--minPerMatch 90
41 +
42 +
43 +#Example
44 +# nohup python3 /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/mapping2MCO_v5.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv --iOntoFile gc_ontology_terms_v2.txt --iSynFile mco_terms_v0.2.json --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v2/ --outputFile srr_htregulondb_v2.tsv --minPerMatch 80 --minCRFProbs 0.9 > /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v2/srr_htregulondb_mapping_report_v2.out &
45 +"""
46 +#################### Defining parameters ####################
47 +if __name__ == "__main__":
48 + parser = OptionParser()
49 + parser.add_option(
50 + "--inputPath",
51 + dest="input_path",
52 + help="Path of npl tagged file (crf output)",
53 + metavar="PATH")
54 + parser.add_option(
55 + "--iAnnotatedFile",
56 + dest="npl_fname",
57 + help="Input file of npl tagged file (crf output)",
58 + metavar="FILE",
59 + default="")
60 + parser.add_option(
61 + "--iOntoFile",
62 + dest="onto_fname",
63 + help="Input file with the ontology entities",
64 + metavar="FILE",
65 + default="")
66 + parser.add_option(
67 + "--iLinksFile",
68 + dest="links_fname",
69 + help="Input file with links and id for the ontology",
70 + metavar="FILE",
71 + default=None)
72 + parser.add_option(
73 + "--iSynFile",
74 + dest="syn_fname",
75 + help="Input file for the additional ontology of synonyms",
76 + metavar="FILE",
77 + default=None)
78 + parser.add_option(
79 + "--outputPath",
80 + dest="output_path",
81 + help="Output path to place output files",
82 + metavar="PATH")
83 + parser.add_option(
84 + "--outputFile",
85 + dest="out_fname",
86 + help="Output file name for the mapping process",
87 + metavar="FILE",
88 + default="")
89 + parser.add_option(
90 + "--minPerMatch",
91 + dest="min_score",
92 + help="Minimal string matching percentage")
93 + parser.add_option(
94 + "--minCRFProbs",
95 + dest="min_probs",
96 + help="Minimal crf probabilities")
97 +
98 + (options, args) = parser.parse_args()
99 + if len(args) > 0:
100 + parser.error("Unexpected positional arguments given.")
101 + sys.exit(1)
102 +
103 + #################### DISP PARAMETERS ####################
104 + print('\n\n-------------------------------- PARAMETERS --------------------------------\n')
105 + print("--inputPath Path of npl tagged file: " + str(options.input_path))
106 + print("--iAnnotatedFile Input file of npl tagged file: " + str(options.npl_fname))
107 + print("--iOntoFile Input file with the ontology entities (MCO-terms): " + str(options.onto_fname))
108 + print("--iLinksFile Input file with links and id for the ontology (MCO-type-links): " + str(options.links_fname))
109 + print("--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): " + str(options.syn_fname))
110 + print("--outputPath Output path to place output files: " + str(options.output_path))
111 + print("--outputFile Output of the mapping process: " + str(options.out_fname))
112 + print("--minPerMatch Minimal string matching percentage: " + str(options.min_score))
113 + print("--minCRFProbs Minimal crf probabilities allowed: " + str(options.min_probs))
114 +
115 + print("\n\n")
116 + repognrl = "http://pakal.ccg.unam.mx/cmendezc"
117 + reponame = "automatic-extraction-growth-conditions/tree/master/extraction-geo/download/srr_htregulondb"
118 + repo_url = '/'.join([repognrl,reponame])
119 +
120 + # Input files ========================================================================================
121 + min_score = int(options.min_score)
122 + min_probs = float(options.min_probs)
123 + npl_ifile = os.path.join(options.input_path, options.npl_fname)
124 + mco_ifile = os.path.join(options.input_path, options.onto_fname)
125 + mco_syn_ifile = os.path.join(options.input_path, options.syn_fname)
126 +
127 + # Output files =======================================================================================
128 +
129 + #Save by mapping strategy
130 + raw_ofname = "_".join(["raw", options.out_fname])
131 + rawmap_ofile = os.path.join(options.output_path, raw_ofname)
132 + str_ofname = "_".join(["sim", options.out_fname])
133 + strmap_ofile = os.path.join(options.output_path, str_ofname)
134 +
135 + #Saving mapped and unmapped outputs
136 + full_map_ofile = os.path.join(options.output_path, "full_map_"+options.out_fname)
137 + full_unmap_ofile = os.path.join(options.output_path, "full_unmap_"+options.out_fname)
138 +
139 + #Save JSONs
140 + json_ofile = os.path.join(options.output_path, options.out_fname)
141 + json_ofile_map = json_ofile.replace(".tsv", "_map.json")
142 + json_ofile_unmap= json_ofile.replace(".tsv", "_unmap.json")
143 + json_ofile_full = json_ofile.replace(".tsv", "_full.json")
144 +
145 + # Load input data ====================================================================================
146 +
147 + #Columns for the NPL-CRF extraction
148 + exp_cols = {"SRR", "GSE", "GSM", "GPL", "PMID", "FULL_TEXT", "BANGLINE", "TERM_NAME", "TERM_TYPE", "PROB"}
149 +
150 + #Load CRF-annotation
151 + npl_full = read_table(npl_ifile, sep = "\t")
152 +
153 + #Check input
154 + obs_cols = set(npl_full.columns)
155 + if exp_cols.intersection(obs_cols) != exp_cols:
156 + ocol = ", ".join(list(exp_cols))
157 + sys.exit("iAnnotatedFile is expected to contain the columns: " + ocol)
158 +
159 + #Filter Input by probs
160 + npl_df = npl_full[npl_full.PROB >= min_probs]
161 + npl_df = npl_df.drop_duplicates(keep="first")
162 + npl_df = npl_df.dropna()
163 +
164 + #Cleaning input
165 + npl_df['TERM_TYPE'] = [mapping_fun.transterm_npl2mco(term) for term in npl_df.TERM_TYPE]
166 + #filter non-MCO term types
167 + npl_df = npl_df[npl_df.TERM_TYPE != "exTag Type"]
168 +
169 + #add REPO_FILE source: access to the stored files at GitLab
170 + source_access = ['/'.join([repo_url,gse,gse+'.soft.gz']) for gse in npl_df['GSE']]
171 + npl_df['REPO_FILE'] = source_access
172 +
173 + ##remove additional spaces
174 + npl_df['TERM_NAME'] = [txt.strip() for txt in npl_df['TERM_NAME']]
175 + npl_df['PMID'] = [pmid.replace("PMID_", "") for pmid in npl_df['PMID']]
176 +
177 + #Columns for MCO
178 + exp_cols = {"TERM_ID", "TERM_NAME"}
179 +
180 + #Load MCO term names
181 + mco_df_full = read_table(mco_ifile, sep = "\t")
182 +
183 + #Check input MCO
184 + obs_cols = set(mco_df_full.columns)
185 + if exp_cols.intersection(obs_cols) != exp_cols:
186 + sys.exit("iOntoFile is expected to contain the columns \"TERM_ID\" and \"TERM_NAME\"")
187 +
188 + #Clean MCO input
189 + mco_df = mco_df_full[["TERM_ID","TERM_NAME"]]
190 + mco_df = mco_df.drop_duplicates(keep="first")
191 + mco_df = mco_df.dropna()
192 +
193 + #Load MCO links
194 + if options.links_fname is not None:
195 + print("\nLoad types...")
196 + mcolink_ifile = os.path.join(options.input_path, options.links_fname)
197 + exp_cols = {"TERM_ID", "TERM_TYPE"}
198 + mco_links_full = read_table(mcolink_ifile, sep = "\t")
199 +
200 + obs_cols = set(mco_links_full.columns)
201 +
202 + if exp_cols.intersection(obs_cols) != exp_cols:
203 + sys.exit("at least \"TERM_ID\" and \"TERM_TYPE\" expected columns for iLinksFile" )
204 +
205 + mco_links = mco_links_full[["TERM_ID", "TERM_TYPE"]]
206 + mco_links = mco_links.drop_duplicates(keep="first")
207 + mco_links = mco_links.dropna()
208 + else:
209 + mco_links = None
210 +
211 + #Load MCO terms synonyms
212 + mco_json = open(mco_syn_ifile )
213 +
214 + #format json from mco to dataframe
215 + data = json.load(mco_json)
216 + mco_syn = format_fun.json2DataFrame(data)
217 +
218 +
219 + print('\n\n-------------------------------- INPUTS --------------------------------\n')
220 +
221 + print("\nnpl tagged file\n")
222 + print(npl_df.head(3))
223 + print("\nontology entities\n")
224 + print(mco_df.head(3))
225 + if options.links_fname is not None:
226 + print("\nlinks and id for the ontology (MCO-type-links)\n")
227 + print(mco_links.head(3))
228 + print("\nadditional ontology of synonyms (MCO-syn-json)\n")
229 + print(mco_syn.head(3))
230 +
231 +
232 + print('\n\n-------------------------------- RESULTS --------------------------------\n')
233 +
234 + #################### mapping to MCO exact string ####################
235 + #npl_df = npl_df.drop_duplicates("TERM_NAME", keep="first")
236 + #npl_df = npl_df.head(10)
237 +
238 + print("\nTracking exact terms to MCO...")
239 + print(f"\nMapping {len(npl_df.index)} terms to MCO based on exact strings...")
240 +
241 + #Exact mapping to MCO
242 + raw_matches = mapping_fun.raw_map_mco(
243 + npl_df = npl_df,
244 + mco_df = mco_df,
245 + mco_links = mco_links,
246 + unmap = True)
247 +
248 + #save file name source of the raw mapping
249 + raw_matches["SOURCE"] = mco_ifile
250 + #additional column to merge
251 + raw_matches["ENTITY_NAME"] = ""
252 +
253 + #################### mapping to MCO.syn exact string ####################
254 +
255 + #define unmapped
256 + raw_mco_unmap = raw_matches[raw_matches.isna().TERM_ID]
257 + #input for the second step
258 + raw_mco_unmap = raw_mco_unmap[list(npl_df.columns)]
259 +
260 + print(f"\nMapping {len(raw_mco_unmap.index)} terms to MCO - synonyms based on exact strings...\n")
261 +
262 + #exact mapping to synonyms
263 + raw_matches_syn = mapping_fun.raw_map_mco(
264 + npl_df = raw_mco_unmap,
265 + mco_df = mco_syn,
266 + unmap = True)
267 +
268 + #additional column to merge
269 + raw_matches_syn["SOURCE"] = mco_syn_ifile
270 + #raw_matches_syn["TERM_TYPE"] = ""
271 +
272 + #################### save mapped terms based on exact strings ####################
273 +
274 + #all mapped
275 + raw_map_odf = concat(
276 + [raw_matches, raw_matches_syn],
277 + sort=True).dropna()
278 +
279 + #print(raw_map_odf.head(3))
280 + print(f"Total of terms mapped by exact strings: {len(raw_map_odf.index)}")
281 + print("Saving filtered terms from raw mapping...\n\n")
282 +
283 + raw_map_odf.to_csv(
284 + rawmap_ofile,
285 + sep = "\t",
286 + header =True,
287 + index=False)
288 +
289 + #################### unmapped raw terms ####################
290 + raw_mco_syn_unmap = raw_matches_syn[raw_matches_syn.isna().TERM_ID]
291 + raw_mco_syn_unmap = raw_mco_syn_unmap[list(npl_df.columns)]
292 +
293 + print(f"{len(raw_mco_syn_unmap.index)} unmapped terms based on exact strings")
294 + print("Dropping duplicated unmapped term names...")
295 + raw_mco_syn_unmap = raw_mco_syn_unmap.drop_duplicates("TERM_NAME")
296 +
297 + print(f"{len(raw_mco_syn_unmap.index)} unmapped unique terms based on exact strings")
298 +
299 + #################### string similarity mapping ####################
300 + ###Matching unmapped terms by string similarity
301 + print("\ncompute string similarty...")
302 +
303 + print(f"\nMapping to MCO {len(raw_mco_syn_unmap.index)} terms based on string similarity...")
304 +
305 + str_matches = mapping_fun.str_match_map_mco(
306 + raw_mco_syn_unmap, mco_df,
307 + mco_links = mco_links,
308 + min_match=0,
309 + npl_merges=False)
310 +
311 + str_matches_odf = str_matches[str_matches.SET >= min_score]
312 + str_matches_odf["SOURCE"] = mco_ifile
313 +
314 + #################### unmapped sim terms (MCO) ####################
315 + str_mco_unmap = str_matches[str_matches.SET < min_score]
316 + #str_mco_unmap = str_mco_unmap[list(npl_df.columns)]
317 + str_mco_unmap = str_mco_unmap.drop_duplicates("TERM_NAME")
318 +
319 +    print(f"\nMapping to MCO - synonyms {len(str_mco_unmap.index)} terms based on string similarity...\n")
320 + str_matches_syn = mapping_fun.str_match_map_mco(
321 + str_mco_unmap, mco_syn,
322 + min_match=min_score,
323 + npl_merges=False)
324 +
325 + str_matches_syn_odf = str_matches_syn[str_matches_syn.SET >= min_score]
326 + str_matches_syn_odf["SOURCE"] = mco_syn_ifile
327 +
328 + #################### save str-sim map terms ####################
329 + all_str_matches_odf = concat(
330 + [str_matches_odf, str_matches_syn_odf],
331 + sort = True).dropna()
332 +
333 + print(f"Unique terms mapped by string similarity: {len(all_str_matches_odf.index)}")
334 +
335 + all_str_matches_npl_odf = merge(
336 + npl_df, all_str_matches_odf,
337 + on = ["TERM_NAME"],
338 + how="inner")
339 +
340 + print(f"Total of terms mapped by string similarity: {len(all_str_matches_npl_odf.index)}")
341 + print("Saving filtered terms from str mapping...\n\n")
342 +
343 + all_str_matches_npl_odf.to_csv(
344 + strmap_ofile,
345 + sep = "\t",
346 + header =True,
347 + index=False)
348 +
349 + #################### save all map terms ####################
350 + raw_map_odf["CASE_MATCH"] = "MCO"
351 + raw_map_odf["SET"] = 100
352 + raw_map_odf["SORT"] = 100
353 +
354 + full_map = concat(
355 + [all_str_matches_npl_odf, raw_map_odf],
356 + sort = True)
357 + full_map["MAP"]=True
358 +
359 + full_map.to_csv(full_map_ofile,
360 + sep = "\t",
361 + header =True,
362 + index=False)
363 +
364 + print("--------------------END----------------------")
365 + print(f"Total of terms mapped: {len(full_map.index)}\n")
366 +
367 + ###################### Merge all unmapped ######################
368 + full_unmap = merge(npl_df, full_map[["TERM_NAME", "TERM_ID"]], on = ["TERM_NAME"], how='left')
369 + full_unmap = full_unmap[full_unmap.isna().TERM_ID]
370 + #print(full_unmap.head(3))
371 +
372 + print(f"Total of terms unmapped: {len(full_unmap.index)}")
373 +
374 + full_unmap["SOURCE"] = ""
375 + full_unmap["CASE_MATCH"] = ""
376 + full_unmap["SET"] = 0
377 + full_unmap["SORT"] = 0
378 + full_unmap["MAP"]=False
379 +
380 + full_unmap.to_csv(
381 + full_unmap_ofile,
382 + sep = "\t",
383 + header =True,
384 + index=False)
385 +
386 + #################### Formatting json ####################
387 +
388 + format_fun.to_json(
389 + df = full_map,
390 + source_info = "GEO",
391 + evidence_source = "NPL-CRF",
392 + ofname = json_ofile_map
393 + )
394 +
395 +
396 + format_fun.to_json(
397 + df = full_unmap,
398 + source_info = "GEO",
399 + evidence_source = "NPL-CRF",
400 + ofname = json_ofile_unmap
401 + )
402 +
403 + #Merge output all
404 + full_merge = concat([full_map, full_unmap], sort=True)
405 + format_fun.to_json(
406 + df = full_merge,
407 + source_info = "GEO",
408 + evidence_source = "NPL-CRF",
409 + ofname = json_ofile_full
410 + )
411 +
\ No newline at end of file
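The mapping_fun.str_match_map_mco helper is not part of this commit; given the fuzzywuzzy imports and the SET-score filter applied to its output, a minimal sketch of the string-similarity step, assuming a token_sort_ratio scorer (the function name and everything except TERM_NAME/TERM_ID/SET is hypothetical):

from fuzzywuzzy import fuzz, process
from pandas import DataFrame

def sketch_str_match(unmapped_terms, mco_df, min_score=80):
    #score each unmapped NPL term against every MCO term name and keep
    #the best hit; SET stores the similarity score, as in the script above
    choices = list(mco_df.TERM_NAME)
    rows = []
    for term in unmapped_terms:
        best, score = process.extractOne(term, choices, scorer=fuzz.token_sort_ratio)
        if score >= min_score:
            term_id = mco_df.loc[mco_df.TERM_NAME == best, "TERM_ID"].iloc[0]
            rows.append({"TERM_NAME": term, "TERM_ID": term_id, "SET": score})
    return DataFrame(rows)

Note that the script calls str_match_map_mco with min_match=0 and only afterwards filters str_matches.SET >= min_score, so the raw output keeps every best hit regardless of score.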
1 +# -*- coding: utf-8 -*-
2 +"""
3 +#Setup
4 +"""
5 +
6 +#################### Setup ####################
7 +from optparse import OptionParser
8 +import os
9 +from numpy.core.fromnumeric import sort
10 +from pandas import read_csv, DataFrame, merge, concat, read_table
11 +from numpy import exp, nan, mean
12 +import seaborn as sns
13 +
14 +
15 +import matplotlib.pyplot as plt
16 +import matplotlib
17 +matplotlib.style.use('ggplot')
18 +# %matplotlib inline
19 +
20 +from collections import Counter, defaultdict
21 +import json
22 +
23 +from fuzzywuzzy import fuzz
24 +from fuzzywuzzy import process
25 +
26 +#import format_fun
27 +import format_fun_v6 as format_fun
28 +import mapping_fun
29 +import sys
30 +
31 +"""
32 +# input parameters
33 +--inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
34 +--iAnnotatedFile srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv
35 +--iOntoFile gc_ontology_terms_v2.txt
36 +--iLinksFile gc_ontology_terms_link_v2.txt
37 +--iSynFile mco_terms_v0.2.json
38 +--outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/
39 +--outputFile all_srr_IV_mapped.tsv
40 +--minPerMatch 90
41 +
42 +
43 +#Example
44 +# nohup python3 /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/mapping2MCO_v5.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv --iOntoFile gc_ontology_terms_v2.txt --iSynFile mco_terms_v0.2.json --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v2/ --outputFile srr_htregulondb_v2.tsv --minPerMatch 80 --minCRFProbs 0.9 > /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v2/srr_htregulondb_mapping_report_v2.out &
45 +"""
46 +#################### Defining parameters ####################
47 +if __name__ == "__main__":
48 + parser = OptionParser()
49 + parser.add_option(
50 + "--inputPath",
51 + dest="input_path",
52 + help="Path of npl tagged file (crf output)",
53 + metavar="PATH")
54 + parser.add_option(
55 + "--iAnnotatedFile",
56 + dest="npl_fname",
57 + help="Input file of npl tagged file (crf output)",
58 + metavar="FILE",
59 + default="")
60 + parser.add_option(
61 + "--iOntoFile",
62 + dest="onto_fname",
63 + help="Input file with the ontology entities",
64 + metavar="FILE",
65 + default="")
66 + parser.add_option(
67 + "--iLinksFile",
68 + dest="links_fname",
69 + help="Input file with links and id for the ontology",
70 + metavar="FILE",
71 + default=None)
72 + parser.add_option(
73 + "--iSynFile",
74 + dest="syn_fname",
75 + help="Input file for the additional ontology of synonyms",
76 + metavar="FILE",
77 + default=None)
78 + parser.add_option(
79 + "--outputPath",
80 + dest="output_path",
81 + help="Output path to place output files",
82 + metavar="PATH")
83 + parser.add_option(
84 + "--outputFile",
85 + dest="out_fname",
86 + help="Output file name for the mapping process",
87 + metavar="FILE",
88 + default="")
89 + parser.add_option(
90 + "--minPerMatch",
91 + dest="min_score",
92 + help="Minimal string matching percentage")
93 + parser.add_option(
94 + "--minCRFProbs",
95 + dest="min_probs",
96 + help="Minimal crf probabilities")
97 +
98 + (options, args) = parser.parse_args()
99 + if len(args) > 0:
100 +        #parser.error() prints the message and exits
101 +        parser.error("Unexpected positional arguments given.")
102 +
103 + #################### DISP PARAMETERS ####################
104 + print('\n\n-------------------------------- PARAMETERS --------------------------------\n')
105 + print("--inputPath Path of npl tagged file: " + str(options.input_path))
106 + print("--iAnnotatedFile Input file of npl tagged file: " + str(options.npl_fname))
107 + print("--iOntoFile Input file with the ontology entities (MCO-terms): " + str(options.onto_fname))
108 + print("--iLinksFile Input file with links and id for the ontology (MCO-type-links): " + str(options.links_fname))
109 + print("--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): " + str(options.syn_fname))
110 + print("--outputPath Output path to place output files: " + str(options.output_path))
111 + print("--outputFile Output of the mapping process: " + str(options.out_fname))
112 + print("--minPerMatch Minimal string matching percentage: " + str(options.min_score))
113 + print("--minCRFProbs Minimal crf probabilities allowed: " + str(options.min_probs))
114 +
115 + print("\n\n")
116 + repognrl = "http://pakal.ccg.unam.mx/cmendezc"
117 + reponame = "automatic-extraction-growth-conditions/tree/master/extraction-geo/download/srr_htregulondb"
118 + repo_url = '/'.join([repognrl,reponame])
119 +
120 + # Input files ========================================================================================
121 + min_score = int(options.min_score)
122 + min_probs = float(options.min_probs)
123 + npl_ifile = os.path.join(options.input_path, options.npl_fname)
124 + mco_ifile = os.path.join(options.input_path, options.onto_fname)
125 + mco_syn_ifile = os.path.join(options.input_path, options.syn_fname)
126 +
127 + # Output files =======================================================================================
128 +
129 +    #Save by mapping strategy
130 + raw_ofname = "_".join(["raw", options.out_fname])
131 + rawmap_ofile = os.path.join(options.output_path, raw_ofname)
132 + str_ofname = "_".join(["sim", options.out_fname])
133 + strmap_ofile = os.path.join(options.output_path, str_ofname)
134 +
135 +    #Saving map and unmap
136 + full_map_ofile = os.path.join(options.output_path, "full_map_"+options.out_fname)
137 + full_unmap_ofile = os.path.join(options.output_path, "full_unmap_"+options.out_fname)
138 +
139 + #Save JSONs
140 + json_ofile = os.path.join(options.output_path, options.out_fname)
141 + json_ofile_map = json_ofile.replace(".tsv", "_map.json")
142 + json_ofile_unmap= json_ofile.replace(".tsv", "_unmap.json")
143 + json_ofile_full = json_ofile.replace(".tsv", "_full.json")
144 +
145 + # Load input data ====================================================================================
146 +
147 + #Columns for the NPL-CRF extraction
148 + exp_cols = {"SRR", "GSE", "GSM", "GPL", "PMID", "FULL_TEXT", "BANGLINE", "TERM_NAME", "TERM_TYPE", "PROB"}
149 +
150 + #Load CRF-annotation
151 + npl_full = read_table(npl_ifile, sep = "\t")
152 +
153 + #Check input
154 + obs_cols = set(npl_full.columns)
155 + if exp_cols.intersection(obs_cols) != exp_cols:
156 + ocol = ", ".join(list(exp_cols))
157 + sys.exit(ocol + " expected columns for iAnnotatedFile" )
158 +
159 + #Filter Input by probs
160 + npl_df = npl_full[npl_full.PROB >= min_probs]
161 + npl_df = npl_df.drop(columns=["PROB"])
162 + npl_df = npl_df.drop_duplicates(keep="first")
163 + npl_df = npl_df.dropna()
164 +
165 + #Cleaning input
166 + npl_df['TERM_TYPE'] = [mapping_fun.transterm_npl2mco(term) for term in npl_df.TERM_TYPE]
167 +    #filter non-MCO term types
168 + npl_df = npl_df[npl_df.TERM_TYPE != "exTag Type"]
169 +
170 +    #add REPO_FILE source: access to stored files at GitLab
171 + source_access = ['/'.join([repo_url,gse,gse+'.soft.gz']) for gse in npl_df['GSE']]
172 + npl_df['REPO_FILE'] = source_access
173 +
174 + ##remove additional spaces
175 + npl_df['TERM_NAME'] = [txt.strip() for txt in npl_df['TERM_NAME']]
176 + npl_df['PMID'] = [pmid.replace("PMID_", "") for pmid in npl_df['PMID']]
177 +
178 + #Columns for MCO
179 + exp_cols = {"TERM_ID", "TERM_NAME"}
180 +
181 + #Load MCO term names
182 + mco_df_full = read_table(mco_ifile, sep = "\t")
183 +
184 + #Check input MCO
185 + obs_cols = set(mco_df_full.columns)
186 + if exp_cols.intersection(obs_cols) != exp_cols:
187 + sys.exit("\"TERM_ID\" and \"TERM_NAME\" expected columns for iOntoFile" )
188 +
189 + #Clean MCO input
190 + mco_df = mco_df_full[["TERM_ID","TERM_NAME"]]
191 + mco_df = mco_df.drop_duplicates(keep="first")
192 + mco_df = mco_df.dropna()
193 +
194 + #Load MCO links
195 + if options.links_fname is not None:
196 + print("\nLoad types...")
197 + mcolink_ifile = os.path.join(options.input_path, options.links_fname)
198 + exp_cols = {"TERM_ID", "TERM_TYPE"}
199 + mco_links_full = read_table(mcolink_ifile, sep = "\t")
200 +
201 + obs_cols = set(mco_links_full.columns)
202 +
203 + if exp_cols.intersection(obs_cols) != exp_cols:
204 + sys.exit("at least \"TERM_ID\" and \"TERM_TYPE\" expected columns for iLinksFile" )
205 +
206 + mco_links = mco_links_full[["TERM_ID", "TERM_TYPE"]]
207 + mco_links = mco_links.drop_duplicates(keep="first")
208 + mco_links = mco_links.dropna()
209 + else:
210 + mco_links = None
211 +
212 + #Load MCO terms synonyms
213 +    #format json from mco to dataframe
214 +    with open(mco_syn_ifile) as mco_json:
215 +        data = json.load(mco_json)
216 +    mco_syn = format_fun.json2DataFrame(data)
217 +
218 +
219 +
220 + print('\n\n-------------------------------- INPUTS --------------------------------\n')
221 +
222 + print("\nnpl tagged file\n")
223 + print(npl_df.head(3))
224 + print("\nontology entities\n")
225 + print(mco_df.head(3))
226 + if options.links_fname is not None:
227 + print("\nlinks and id for the ontology (MCO-type-links)\n")
228 + print(mco_links.head(3))
229 + print("\nadditional ontology of synonyms (MCO-syn-json)\n")
230 + print(mco_syn.head(3))
231 +
232 +
233 + print('\n\n-------------------------------- RESULTS --------------------------------\n')
234 +
235 +    #################### mapping to MCO exact string ####################
236 + #npl_df = npl_df.drop_duplicates("TERM_NAME", keep="first")
237 + #npl_df = npl_df.head(10)
238 +
239 + print("\nTracking exact terms to MCO...")
240 + print(f"\nMapping {len(npl_df.index)} terms to MCO based on exact strings...")
241 +
242 + #Exact mapping to MCO
243 + raw_matches = mapping_fun.raw_map_mco(
244 + npl_df = npl_df,
245 + mco_df = mco_df,
246 + mco_links = mco_links,
247 + unmap = True)
248 +
249 +    #record the source file name for the raw mapping
250 + raw_matches["SOURCE"] = mco_ifile
251 + #additional column to merge
252 + raw_matches["ENTITY_NAME"] = ""
253 +
254 +    #################### mapping to MCO.syn exact string ####################
255 +
256 + #define unmapped
257 + raw_mco_unmap = raw_matches[raw_matches.isna().TERM_ID]
258 +    #input for the second step
259 + raw_mco_unmap = raw_mco_unmap[list(npl_df.columns)]
260 +
261 + print(f"\nMapping {len(raw_mco_unmap.index)} terms to MCO - synonyms based on exact strings...\n")
262 +
263 +    #exact mapping to synonyms
264 + raw_matches_syn = mapping_fun.raw_map_mco(
265 + npl_df = raw_mco_unmap,
266 + mco_df = mco_syn,
267 + unmap = True)
268 +
269 + #additional column to merge
270 + raw_matches_syn["SOURCE"] = mco_syn_ifile
271 + #raw_matches_syn["TERM_TYPE"] = ""
272 +
273 + #################### save mapped terms based on exact strings ####################
274 +
275 + #all mapped
276 + raw_map_odf = concat(
277 + [raw_matches, raw_matches_syn],
278 + sort=True).dropna()
279 +
280 + #print(raw_map_odf.head(3))
281 + print(f"Total of terms mapped by exact strings: {len(raw_map_odf.index)}")
282 + print("Saving filtered terms from raw mapping...\n\n")
283 +
284 + raw_map_odf.to_csv(
285 + rawmap_ofile,
286 + sep = "\t",
287 + header =True,
288 + index=False)
289 +
290 +    #################### unmapped raw terms ####################
291 + raw_mco_syn_unmap = raw_matches_syn[raw_matches_syn.isna().TERM_ID]
292 + raw_mco_syn_unmap = raw_mco_syn_unmap[list(npl_df.columns)]
293 +
294 + print(f"{len(raw_mco_syn_unmap.index)} unmapped terms based on exact strings")
295 + print("Dropping duplicated unmapped term names...")
296 + raw_mco_syn_unmap = raw_mco_syn_unmap.drop_duplicates("TERM_NAME")
297 +
298 + print(f"{len(raw_mco_syn_unmap.index)} unmapped unique terms based on exact strings")
299 +
300 + #################### string similarity mapping ####################
301 +    ###Matching unmapped terms by string similarity
302 +    print("\ncompute string similarity...")
303 +
304 + print(f"\nMapping to MCO {len(raw_mco_syn_unmap.index)} terms based on string similarity...")
305 +
306 + str_matches = mapping_fun.str_match_map_mco(
307 + raw_mco_syn_unmap, mco_df,
308 + mco_links = mco_links,
309 + min_match=0,
310 + npl_merges=False)
311 +
312 + str_matches_odf = str_matches[str_matches.SET >= min_score]
313 + str_matches_odf["SOURCE"] = mco_ifile
314 +
315 +    #################### unmapped sim terms (MCO) ####################
316 + str_mco_unmap = str_matches[str_matches.SET < min_score]
317 + #str_mco_unmap = str_mco_unmap[list(npl_df.columns)]
318 + str_mco_unmap = str_mco_unmap.drop_duplicates("TERM_NAME")
319 +
320 +    print(f"\nMapping to MCO - synonyms {len(str_mco_unmap.index)} terms based on string similarity...\n")
321 + str_matches_syn = mapping_fun.str_match_map_mco(
322 + str_mco_unmap, mco_syn,
323 + min_match=min_score,
324 + npl_merges=False)
325 +
326 + str_matches_syn_odf = str_matches_syn[str_matches_syn.SET >= min_score]
327 + str_matches_syn_odf["SOURCE"] = mco_syn_ifile
328 +
329 + #################### save str-sim map terms ####################
330 + all_str_matches_odf = concat(
331 + [str_matches_odf, str_matches_syn_odf],
332 + sort = True).dropna()
333 +
334 + print(f"Unique terms mapped by string similarity: {len(all_str_matches_odf.index)}")
335 +
336 + all_str_matches_npl_odf = merge(
337 + npl_df, all_str_matches_odf,
338 + on = ["TERM_NAME"],
339 + how="inner")
340 +
341 + print(f"Total of terms mapped by string similarity: {len(all_str_matches_npl_odf.index)}")
342 + print("Saving filtered terms from str mapping...\n\n")
343 +
344 + all_str_matches_npl_odf.to_csv(
345 + strmap_ofile,
346 + sep = "\t",
347 + header =True,
348 + index=False)
349 +
350 + #################### save all map terms ####################
351 + raw_map_odf["CASE_MATCH"] = "MCO"
352 + raw_map_odf["SET"] = 100
353 + raw_map_odf["SORT"] = 100
354 +
355 + full_map = concat(
356 + [all_str_matches_npl_odf, raw_map_odf],
357 + sort = True)
358 + full_map["MAP"]=True
359 +
360 + full_map.to_csv(full_map_ofile,
361 + sep = "\t",
362 + header =True,
363 + index=False)
364 +
365 + print("--------------------END----------------------")
366 + print(f"Total of terms mapped: {len(full_map.index)}\n")
367 +
368 + ###################### Merge all unmapped ######################
369 + full_unmap = merge(npl_df, full_map[["TERM_NAME", "TERM_ID"]], on = ["TERM_NAME"], how='left')
370 + full_unmap = full_unmap[full_unmap.isna().TERM_ID]
371 + #print(full_unmap.head(3))
372 +
373 + print(f"Total of terms unmapped: {len(full_unmap.index)}")
374 +
375 + full_unmap["SOURCE"] = ""
376 + full_unmap["CASE_MATCH"] = ""
377 + full_unmap["SET"] = 0
378 + full_unmap["SORT"] = 0
379 + full_unmap["MAP"]=False
380 +
381 + full_unmap.to_csv(
382 + full_unmap_ofile,
383 + sep = "\t",
384 + header =True,
385 + index=False)
386 +
387 + #################### Formatting json ####################
388 +
389 + format_fun.to_json(
390 + df = full_map,
391 + source_info = "GEO",
392 + evidence_source = "NPL-CRF",
393 + ofname = json_ofile_map
394 + )
395 +
396 +
397 + format_fun.to_json(
398 + df = full_unmap,
399 + source_info = "GEO",
400 + evidence_source = "NPL-CRF",
401 + ofname = json_ofile_unmap
402 + )
403 +
404 + #Merge output all
405 + full_merge = concat([full_map, full_unmap], sort=True)
406 + format_fun.to_json(
407 + df = full_merge,
408 + source_info = "GEO",
409 + evidence_source = "NPL-CRF",
410 + ofname = json_ofile_full
411 + )
412 +
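Both unmapped splits above (raw_matches[raw_matches.isna().TERM_ID] and the final left merge into full_unmap) reduce to the same pandas pattern: a left merge followed by an isna() filter on TERM_ID. A toy illustration with made-up values:

from pandas import DataFrame, merge

npl = DataFrame({"TERM_NAME": ["LB medium", "mystery broth"]})
mco = DataFrame({"TERM_NAME": ["LB medium"], "TERM_ID": ["MCO999999999"]})  #made-up id

hits = merge(npl, mco, on="TERM_NAME", how="left")
unmapped = hits[hits.TERM_ID.isna()]   #rows that found no MCO id
print(list(unmapped.TERM_NAME))        #['mystery broth']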
1 +from pandas import read_csv, merge
2 +crf_output_file = "/home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/srr_htregulondb/srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv"
3 +annot_file = "/home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/correct_gc_terms_07_rev_Victor.csv"
4 +filter_ofile = "/home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/srr_htregulondb_correct_gc_terms_07_rev_Victor.tsv"
5 +
6 +#the curated CSV opens with a GC_type,GC_term header row; header=0 keeps it out of the data
7 +annot = read_csv(annot_file, names = ["TERM_TYPE", "TERM_NAME"], header = 0)
8 +annot.TERM_NAME = [text.strip() for text in annot.TERM_NAME]
9 +crf_output = read_csv(crf_output_file,
10 +                      names = ["SRR","GSE","GSM","GPL","PMID","GSM_NAME",
11 +                               "GSE_NAME","GPL_NAME","BANGLINE",
12 +                               "SOURCE_TEXT_CTRL","FULL_TEXT","TERM_NAME",
13 +                               "TERM_TYPE","PROB"],
14 +                      sep = "\t")
15 +crf_output.TERM_NAME = [text.strip() for text in crf_output.TERM_NAME]
16 +
17 +#inner merge: keep only CRF rows whose (type, term) pair was curated
18 +paso1 = merge(annot, crf_output, on = ["TERM_TYPE", "TERM_NAME"])
19 +paso1 = paso1.reindex(columns = ["SRR","GSE","GSM","GPL","PMID","GSM_NAME",
20 +                                 "GSE_NAME","GPL_NAME","BANGLINE",
21 +                                 "SOURCE_TEXT_CTRL","FULL_TEXT","TERM_NAME",
22 +                                 "TERM_TYPE","PROB"])
23 +
24 +paso1.to_csv(filter_ofile, sep = "\t", index = False, header = True)
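The str.strip() calls on both TERM_NAME columns are what make the exact merge reliable; without them a single trailing space silently drops a row:

print("LB medium " == "LB medium")          #False: trailing space breaks the join
print("LB medium ".strip() == "LB medium")  #True: stripped values match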
1 +GC_type,GC_term
2 +Agit,250 rpm
3 +Air,Aerobic
4 +Air,Aerobic and anaerobic
5 +Air,aerobically
6 +Air,anaerobic
7 +Gtype,{ delta } baeR
8 +Gtype,{ delta } cpxR
9 +Gtype,{ delta } cspABCEG
10 +Gtype,{ delta } cspABEG
11 +Gtype,{ delta } cspBG
12 +Gtype,{ delta } hns
13 +Gtype,{ delta } kdpE
14 +Gtype,{ delta } nusG
15 +Gtype,{ delta } perC : : kanR
16 +Gtype,{ delta } phoB
17 +Gtype,{ delta } rho
18 +Gtype,{ delta } rnr
19 +Gtype,{ delta } zraR
20 +Gtype,: φO104
21 +Gtype,DH5α ( pAR060302 )
22 +Gtype,E.coli K12 BW25113
23 +Gtype,K12 MG1655
24 +Gtype,K12 MG1655 deltaprfC
25 +Gtype,K12 MG1655 prfB-Bstrain allele
26 +Gtype,K12 MG1655 prfB-Bstrain allele deltaprfC
27 +Gtype,K12 MG1657
28 +Gtype,K12 MG1667
29 +Gtype,K12 MG1668
30 +Gtype,K12 MG1672
31 +Gtype,K12 MG1673
32 +Gtype,K12 MG1674
33 +Gtype,K12 W3110
34 +Gtype,MC4100 ∆ tig : : kan pTig-TEV-Avi
35 +Gtype,O157 : H7 NCTC 12900
36 +Gtype,PNPase mutant
37 +Gtype,Pck over-expressed
38 +Gtype,RNase II mutant
39 +Gtype,RNase R mutant
40 +Gtype,W3110 6xHis-rpoD
41 +Gtype,W3110 6xHis-rpoD greA : : tet greB : : amp
42 +Gtype,"W3110 rpoC-6xHis : : kan greA : : tet , greB : : amp"
43 +Gtype,WT
44 +Gtype,WT WT
45 +Gtype,Wild type
46 +Gtype,Wild-type
47 +Gtype,Wildtype
48 +Gtype,cra KO ; BW25113 Dcra
49 +Gtype,cya mutant background
50 +Gtype,delta Crp
51 +Gtype,delta _ cra
52 +Gtype,delta-gadE
53 +Gtype,delta-gadW
54 +Gtype,delta-gadX
55 +Gtype,delta-oxyR
56 +Gtype,delta-soxR
57 +Gtype,delta-soxS
58 +Gtype,fepA knockout
59 +Gtype,fis mutant background
60 +Gtype,lacA knockout
61 +Gtype,lack Fis protein
62 +Gtype,lack H-NS protein
63 +Gtype,naive ( wild type )
64 +Gtype,ompR deletion mutant
65 +Gtype,phageO104 in the wrbA gene
66 +Gtype,phagePA8 in the argW gene
67 +Gtype,rng mutant
68 +Gtype,sdhC knockout
69 +Gtype,sigma70 WT
70 +Gtype,wild type
71 +Gtype,wild type ; MG1655
72 +Gtype,wild-type
73 +Gtype,wildtype
74 +Gtype,wt
75 +Gtype,yafC deletion
76 +Gtype,ybaO deletion
77 +Gtype,ybaQ deletion
78 +Gtype,ybiH deletion
79 +Gtype,ydcI deletion
80 +Gtype,yddM deletion
81 +Gtype,yeiE deletion
82 +Gtype,yheO deletion
83 +Gtype,yiaJ deletion
84 +Gtype,yieP deletion
85 +Gtype,Δcra
86 +Gtype,Δfur
87 +Gtype,ΔgadE
88 +Gtype,ΔgadW
89 +Gtype,ΔgadX
90 +Gtype,ΔoxyR
91 +Gtype,ΔsoxR
92 +Gtype,ΔsoxS
93 +Gtype,∆ cspABCEG
94 +Gtype,∆ cspABEG
95 +Gtype,∆ cspBG
96 +Gtype,∆ hfq : : cat )
97 +Gtype,∆ rnr
98 +Med,Bertani ( LB ) medium
99 +Med,Davis Minimal medium
100 +Med,LB
101 +Med,LB media
102 +Med,LB medium
103 +Med,"LB medium ,"
104 +Med,M9 + 4 g/L glc ( glucose minimal media )
105 +Med,M9 minimal media
106 +Med,M9 minimal medium
107 +Med,MOPS complete-glucose liquid media
108 +Med,MOPS glucose minimal medium
109 +Med,MOPS medium
110 +Med,Neidhardt MOPS Minimal Medium ( NM3 )
111 +Med,SB medium
112 +Med,SILAC
113 +Med,W2 minimal media
114 +Med,fresh DM500
115 +Med,fully supplemented MOPS glucose media
116 +Med,glucose-M9 minimal media
117 +Med,glucose-limited minimal medium
118 +Med,in fresh LB medium
119 +Med,minimal medium
120 +OD,O.D. 600nm 0.5
121 +OD,OD600 = 0.3
122 +OD,OD600 of about 0.8
123 +Phase,IspG1 strain
124 +Phase,exponential
125 +Phase,log phase
126 +Phase,log phase sample
127 +Phase,mid-log phase
128 +Phase,stationary
129 +Phase,stationary phase
130 +Supp,0.1 mM KCl
131 +Supp,0.2 % arabinose
132 +Supp,0.2 % glucose
133 +Supp,0.2 % glutamine
134 +Supp,0.2 mM of DPD
135 +Supp,0.3 % glucose
136 +Supp,0.3 M of NaCl
137 +Supp,0.4 % glucose
138 +Supp,0.5 % glucose
139 +Supp,100 μM IPTG
140 +Supp,1mM IPTG
141 +Supp,2 mM Hydrogen peroxide
142 +Supp,22 mM glucose
143 +Supp,250 uM of paraquat
144 +Supp,2g/L glucose
145 +Supp,2g/L glucose and 1 mM cytidine
146 +Supp,4g/L glucose
147 +Supp,50 µM NiCl2
148 +Supp,70 µM IPTG
149 +Supp,DPD
150 +Supp,Fe
151 +Supp,IPTG
152 +Supp,IPTG was
153 +Supp,L-trp
154 +Supp,Xgal and IPTG
155 +Supp,acetate
156 +Supp,ade
157 +Supp,arabinose
158 +Supp,fructose
159 +Supp,glucose
160 +Supp,glutamine
161 +Supp,induced 50 µM IPTG
162 +Supp,mM IPTG
163 +Supp,mM IPTG + 50μg/ml Amp
164 +Supp,rhamnose
165 +Supp,rifampicin
166 +Supp,rifampicin and
167 +Supp,rifampicin time point
168 +Supp,rifampicin time point 0
169 +Supp,rifampicin time point 4
170 +Supp,rifampicin time point 6
171 +Supp,rifampicin time point 8
172 +Temp,10 °C
173 +Temp,30 °C
174 +Temp,37 °C
175 +Temp,37 ℃
176 +Temp,42 °C
177 +pH,pH 5.5
178 +pH,pH5 .5
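Since the curated file above opens with a GC_type,GC_term header row, loading it with explicit column names needs header=0, as in the filter script; otherwise the header row lands in the data. A quick check (assuming a local copy of the file):

from pandas import read_csv

annot = read_csv("correct_gc_terms_07_rev_Victor.csv",
                 names=["TERM_TYPE", "TERM_NAME"], header=0)
print(annot.TERM_TYPE.iloc[0])   #'Agit', not 'GC_type'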
1 +# -*- coding: utf-8 -*-
2 +"""
3 +#Setup
4 +"""
5 +
6 +#################### Setup ####################
7 +from collections import defaultdict
8 +from optparse import OptionParser
9 +import os
10 +from numpy.core.fromnumeric import sort
11 +from pandas import read_csv, DataFrame, merge, concat, read_table
12 +from numpy import exp, nan, mean
13 +import seaborn as sns
14 +
15 +
16 +import matplotlib.pyplot as plt
17 +import matplotlib
18 +matplotlib.style.use('ggplot')
19 +# %matplotlib inline
20 +
21 +from collections import Counter
22 +import json
23 +
24 +from fuzzywuzzy import fuzz
25 +from fuzzywuzzy import process
26 +
27 +import format_fun
28 +import mapping_fun
29 +import sys
30 +
31 +"""
32 +# input parameters
33 +--inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
34 +--iAnnotatedFile srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv
35 +--iOntoFile gc_ontology_terms_v2.txt
36 +--iLinksFile gc_ontology_terms_link_v2.txt
37 +--iSynFile mco_terms_v0.2.json
38 +--outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/
39 +--outputFile all_srr_IV_mapped.tsv
40 +--minPerMatch 90
41 +
42 +
43 +#Example
44 +# nohup python3 mapping2MCO_v3.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv --iOntoFile gc_ontology_terms_v2.txt --iSynFile mco_terms_v0.2.json --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ --outputFile srr_htregulondb_mapped.tsv --minPerMatch 80 --minCRFProbs 0.9 > ../reports/srr_htregulondb_mapping_report.out &
45 +"""
46 +#################### Defining parameters ####################
47 +if __name__ == "__main__":
48 + parser = OptionParser()
49 + parser.add_option(
50 + "--inputPath",
51 + dest="input_path",
52 + help="Path of npl tagged file (crf output)",
53 + metavar="PATH")
54 + parser.add_option(
55 + "--iAnnotatedFile",
56 + dest="npl_fname",
57 + help="Input file of npl tagged file (crf output)",
58 + metavar="FILE",
59 + default="")
60 + parser.add_option(
61 + "--iOntoFile",
62 + dest="onto_fname",
63 + help="Input file with the ontology entities",
64 + metavar="FILE",
65 + default="")
66 + parser.add_option(
67 + "--iLinksFile",
68 + dest="links_fname",
69 + help="Input file with links and id for the ontology",
70 + metavar="FILE",
71 + default=None)
72 + parser.add_option(
73 + "--iSynFile",
74 + dest="syn_fname",
75 + help="Input file for the additional ontology of synonyms",
76 + metavar="FILE",
77 + default=None)
78 + parser.add_option(
79 + "--outputPath",
80 + dest="output_path",
81 + help="Output path to place output files",
82 + metavar="PATH")
83 + parser.add_option(
84 + "--outputFile",
85 + dest="out_fname",
86 + help="Output file name for the mapping process",
87 + metavar="FILE",
88 + default="")
89 + parser.add_option(
90 + "--minPerMatch",
91 + dest="min_score",
92 + help="Minimal string matching percentage")
93 + parser.add_option(
94 + "--minCRFProbs",
95 + dest="min_probs",
96 + help="Minimal crf probabilities")
97 +
98 + (options, args) = parser.parse_args()
99 + if len(args) > 0:
100 +        #parser.error() prints the message and exits
101 +        parser.error("Unexpected positional arguments given.")
102 +
103 + #################### DISP PARAMETERS ####################
104 + print('\n\n-------------------------------- PARAMETERS --------------------------------\n')
105 + print("--inputPath Path of npl tagged file: " + str(options.input_path))
106 + print("--iAnnotatedFile Input file of npl tagged file: " + str(options.npl_fname))
107 + print("--iOntoFile Input file with the ontology entities (MCO-terms): " + str(options.onto_fname))
108 + print("--iLinksFile Input file with links and id for the ontology (MCO-type-links): " + str(options.links_fname))
109 + print("--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): " + str(options.syn_fname))
110 + print("--outputPath Output path to place output files: " + str(options.output_path))
111 + print("--outputFile Output of the mapping process: " + str(options.out_fname))
112 + print("--minPerMatch Minimal string matching percentage: " + str(options.min_score))
113 + print("--minCRFProbs Minimal crf probabilities allowed: " + str(options.min_probs))
114 +
115 + print("\n\n")
116 + repognrl = "http://pakal.ccg.unam.mx/cmendezc"
117 + reponame = "automatic-extraction-growth-conditions/tree/master/extraction-geo/download/srr_htregulondb"
118 + repo_url = '/'.join([repognrl,reponame])
119 +
120 + # Input files
121 + min_score = int(options.min_score)
122 + min_probs = float(options.min_probs)
123 + npl_ifile = os.path.join(options.input_path, options.npl_fname)
124 + mco_ifile = os.path.join(options.input_path, options.onto_fname)
125 + mco_syn_ifile = os.path.join(options.input_path, options.syn_fname)
126 +
127 + #Output files
128 + raw_ofname = "_".join(["raw", options.out_fname])
129 + rawmap_ofile = os.path.join(options.output_path, raw_ofname)
130 + str_ofname = "_".join(["sim", options.out_fname])
131 + strmap_ofile = os.path.join(options.output_path, str_ofname)
132 +
133 + full_ofile = os.path.join(options.output_path, "full_"+options.out_fname)
134 + full_unmap_ofile = os.path.join(options.output_path, "full_unmap_"+options.out_fname)
135 +
136 + json_ofile = os.path.join(options.output_path, options.out_fname)
137 + json_ofile_map = json_ofile.replace(".tsv", "_map.json")
138 + json_ofile_unmap= json_ofile.replace(".tsv", "_unmap.json")
139 +
140 + #################### Load input data ####################
141 + # Load CRF-annotation
142 + exp_cols = {"SRR", "GSE", "GSM", "GPL", "PMID", "FULL_TEXT", "BANGLINE", "TERM_NAME", "TERM_TYPE", "PROB"}
143 + npl_full = read_table(npl_ifile, sep = "\t")
144 +
145 + obs_cols = set(npl_full.columns)
146 +
147 + if exp_cols.intersection(obs_cols) != exp_cols:
148 + ocol = ", ".join(list(exp_cols))
149 + sys.exit(ocol + " expected columns for iAnnotatedFile" )
150 +
151 + #Load MCO term names
152 + exp_cols = {"TERM_ID", "TERM_NAME"}
153 + mco_df_full = read_table(mco_ifile, sep = "\t")
154 + obs_cols = set(mco_df_full.columns)
155 +
156 + if exp_cols.intersection(obs_cols) != exp_cols:
157 + sys.exit("\"TERM_ID\" and \"TERM_NAME\" expected columns for iOntoFile" )
158 +
159 + mco_df = mco_df_full[["TERM_ID","TERM_NAME"]]
160 + mco_df = mco_df.drop_duplicates(keep="first")
161 + mco_df = mco_df.dropna()
162 +
163 + #Load MCO links
164 + if options.links_fname is not None:
165 + print("\nLoad types...")
166 + mcolink_ifile = os.path.join(options.input_path, options.links_fname)
167 + exp_cols = {"TERM_ID", "TERM_TYPE"}
168 + mco_links_full = read_table(mcolink_ifile, sep = "\t")
169 +
170 + obs_cols = set(mco_links_full.columns)
171 +
172 + if exp_cols.intersection(obs_cols) != exp_cols:
173 + sys.exit("at least \"TERM_ID\" and \"TERM_TYPE\" expected columns for iLinksFile" )
174 +
175 + mco_links = mco_links_full[["TERM_ID", "TERM_TYPE"]]
176 + mco_links = mco_links.drop_duplicates(keep="first")
177 + mco_links = mco_links.dropna()
178 + else:
179 + mco_links = None
180 +
181 + #Load MCO terms synonyms
182 + #format json from mco to dataframe
183 +    with open(mco_syn_ifile) as mco_json:
184 +        data = json.load(mco_json)
185 + mco_syn = format_fun.json2DataFrame(data)
186 +
187 +    df_json = defaultdict(list)
188 +
189 +    #group one record per row, keyed by SRR
190 +    #full_unmap is assumed to come from the mapping steps of the later
191 +    #script versions; this file ends before computing it
192 +    for idx, row in full_unmap.iterrows():
193 +        record = format_fun.created_record(row)
194 +        df_json[row.SRR].append(record)
195 +
196 +    json_ofile_list = json_ofile.replace(".tsv", "_list.json")
197 +    with open(json_ofile_list, "w") as output:
198 +        json.dump(format_fun.created_record(df_json), output)
199 +
200 +    json_ofile_df_list = json_ofile.replace(".tsv", "_df_list.json")
201 +    with open(json_ofile_df_list, "a") as output:
202 +        for srr, records in df_json.items():
203 +            json.dump(format_fun.created_record(records), output)
\ No newline at end of file
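One caveat on the final block: appending several json.dump() payloads to a single file produces concatenated JSON that standard parsers reject as one document. The usual alternative is JSON Lines, one independently parseable object per line; a minimal sketch with a hypothetical payload:

import json

records = {"SRR5742248": [{"term": "LB medium"}]}   #hypothetical payload
with open("records.jsonl", "w") as fh:
    for srr, recs in records.items():
        fh.write(json.dumps({"srr": srr, "records": recs}) + "\n")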
1 +/usr/local/lib/python3.6/dist-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning
2 + warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
3 +/home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/mapping2MCO_v5.py:312: SettingWithCopyWarning:
4 +A value is trying to be set on a copy of a slice from a DataFrame.
5 +Try using .loc[row_indexer,col_indexer] = value instead
6 +
7 +See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
8 + str_matches_odf["SOURCE"] = mco_ifile
9 +
10 +
11 +-------------------------------- PARAMETERS --------------------------------
12 +
13 +--inputPath Path of npl tagged file: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
14 +--iAnnotatedFile Input file of npl tagged file: srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv
15 +--iOntoFile Input file with the ontology entities (MCO-terms): gc_ontology_terms_v2.txt
16 +--iLinksFile Input file with links and id for the ontology (MCO-type-links): None
17 +--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): mco_terms_v0.2.json
18 +--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v2/
19 +--outputFile Output of the mapping process: srr_htregulondb.tsv
20 +--minPerMatch Minimal string matching percentage: 80
21 +--minCRFProbs Minimal crf probabilities allowed: 0.9
22 +
23 +
24 +
25 +
26 +
27 +-------------------------------- INPUTS --------------------------------
28 +
29 +
30 +npl tagged file
31 +
32 + SRR ... REPO_FILE
33 +0 SRR5742248 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
34 +5 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
35 +7 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
36 +
37 +[3 rows x 15 columns]
38 +
39 +ontology entities
40 +
41 + TERM_ID TERM_NAME
42 +0 MCO000000014 generically dependent continuant
43 +1 MCO000000015 radiation
44 +2 MCO000000016 electromagnetic radiation
45 +
46 +additional ontology of synonyms (MCO-syn-json)
47 +
48 + ENTITY_NAME TERM_ID TERM_NAME
49 +MCO000000019 continuant MCO000000019
50 +MCO000002475 culture medium MCO000002475
51 +MCO000002467_0 Organism MCO000002467 biologicentity
52 +
53 +
54 +-------------------------------- RESULTS --------------------------------
55 +
56 +
57 +Tracking exact terms to MCO...
58 +
59 +Mapping 4099 terms to MCO based on exact strings...
60 +
61 +Mapping 3770 terms to MCO - synonyms based on exact strings...
62 +
63 +Total of terms mapped by exact strings: 387
64 +Saving filtered terms from raw mapping...
65 +
66 +
67 +3712 unmapped terms based on exact strings
68 +Dropping duplicated unmapped term names...
69 +206 unmapped unique terms based on exact strings
70 +
71 +compute string similarity...
72 +
73 +Mapping to MCO 206 terms based on string similarity...
74 +
75 +Mapping to MCO - synonyms 152 terms based on string similarity...
76 +
77 +Unique terms mapped by string similarity: 73
78 +Total of terms mapped by string similarity: 1992
79 +Saving filtered terms from str mapping...
80 +
81 +
82 +--------------------END----------------------
83 +Total of terms mapped: 2379
84 +
85 +Total of terms unmapped: 1720
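The SettingWithCopyWarning at the top of this report points at the assignment into a boolean-filtered slice. pandas' suggested fix is either .loc or an explicit .copy() before adding columns; a self-contained sketch with stand-in data:

from pandas import DataFrame

str_matches = DataFrame({"TERM_NAME": ["LB media"], "SET": [92]})   #stand-in data
min_score, mco_ifile = 80, "gc_ontology_terms_v2.txt"

#copy the slice first, then add the column warning-free
str_matches_odf = str_matches[str_matches.SET >= min_score].copy()
str_matches_odf["SOURCE"] = mco_ifile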
1 +/usr/local/lib/python3.6/dist-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning
2 + warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
3 +/home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/mapping2MCO_v6.py:313: SettingWithCopyWarning:
4 +A value is trying to be set on a copy of a slice from a DataFrame.
5 +Try using .loc[row_indexer,col_indexer] = value instead
6 +
7 +See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
8 + str_matches_odf["SOURCE"] = mco_ifile
9 +
10 +
11 +-------------------------------- PARAMETERS --------------------------------
12 +
13 +--inputPath Path of npl tagged file: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
14 +--iAnnotatedFile Input file of npl tagged file: srr_htregulondb_correct_gc_terms_07_rev_Victor.tsv
15 +--iOntoFile Input file with the ontology entities (MCO-terms): gc_ontology_terms_v2.txt
16 +--iLinksFile Input file with links and id for the ontology (MCO-type-links): None
17 +--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): mco_terms_v0.2.json
18 +--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v4/curated/
19 +--outputFile Output of the mapping process: srr_htregulondb_correct_gc_terms_07_rev_Victor_mapped.tsv
20 +--minPerMatch Minimal string matching percentage: 80
21 +--minCRFProbs Minimal crf probabilities allowed: 0.9
22 +
23 +
24 +
25 +
26 +
27 +-------------------------------- INPUTS --------------------------------
28 +
29 +
30 +npl tagged file
31 +
32 + SRR ... REPO_FILE
33 +0 SRR771533 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
34 +2 SRR771534 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
35 +24 SRR3194453 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
36 +
37 +[3 rows x 14 columns]
38 +
39 +ontology entities
40 +
41 + TERM_ID TERM_NAME
42 +0 MCO000000014 generically dependent continuant
43 +1 MCO000000015 radiation
44 +2 MCO000000016 electromagnetic radiation
45 +
46 +additional ontology of synonyms (MCO-syn-json)
47 +
48 + ENTITY_NAME TERM_ID TERM_NAME
49 +MCO000000019 continuant MCO000000019
50 +MCO000002475 culture medium MCO000002475
51 +MCO000002467_0 Organism MCO000002467 biologicentity
52 +
53 +
54 +-------------------------------- RESULTS --------------------------------
55 +
56 +
57 +Tracking exact terms to MCO...
58 +
59 +Mapping 2149 terms to MCO based on exact strings...
60 +
61 +Mapping 1820 terms to MCO - synonyms based on exact strings...
62 +
63 +Total of terms mapped by exact strings: 387
64 +Saving filtered terms from raw mapping...
65 +
66 +
67 +1762 unmapped terms based on exact strings
68 +Dropping duplicated unmapped term names...
69 +104 unmapped unique terms based on exact strings
70 +
71 +compute string similarity...
72 +
73 +Mapping to MCO 104 terms based on string similarity...
74 +
75 +Mapping to MCO - synonyms 61 terms based on string similarity...
76 +
77 +Unique terms mapped by string similarity: 58
78 +Total of terms mapped by string similarity: 1570
79 +Saving filtered terms from str mapping...
80 +
81 +
82 +--------------------END----------------------
83 +Total of terms mapped: 1957
84 +
85 +Total of terms unmapped: 192
1 +/usr/local/lib/python3.6/dist-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning
2 + warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
3 +/home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/mapping2MCO_v5.py:312: SettingWithCopyWarning:
4 +A value is trying to be set on a copy of a slice from a DataFrame.
5 +Try using .loc[row_indexer,col_indexer] = value instead
6 +
7 +See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
8 + str_matches_odf["SOURCE"] = mco_ifile
9 +
10 +
11 +-------------------------------- PARAMETERS --------------------------------
12 +
13 +--inputPath Path of npl tagged file: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
14 +--iAnnotatedFile Input file of npl tagged file: srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv
15 +--iOntoFile Input file with the ontology entities (MCO-terms): gc_ontology_terms_v2.txt
16 +--iLinksFile Input file with links and id for the ontology (MCO-type-links): None
17 +--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): mco_terms_v0.2.json
18 +--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v3/
19 +--outputFile Output of the mapping process: srr_htregulondb.tsv
20 +--minPerMatch Minimal string matching percentage: 80
21 +--minCRFProbs Minimal crf probabilities allowed: 0.9
22 +
23 +
24 +
25 +
26 +
27 +-------------------------------- INPUTS --------------------------------
28 +
29 +
30 +npl tagged file
31 +
32 + SRR ... REPO_FILE
33 +0 SRR5742248 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
34 +5 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
35 +7 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
36 +
37 +[3 rows x 15 columns]
38 +
39 +ontology entities
40 +
41 + TERM_ID TERM_NAME
42 +0 MCO000000014 generically dependent continuant
43 +1 MCO000000015 radiation
44 +2 MCO000000016 electromagnetic radiation
45 +
46 +additional ontology of synonyms (MCO-syn-json)
47 +
48 + ENTITY_NAME TERM_ID TERM_NAME
49 +MCO000000019 continuant MCO000000019
50 +MCO000002475 culture medium MCO000002475
51 +MCO000002467_0 Organism MCO000002467 biologicentity
52 +
53 +
54 +-------------------------------- RESULTS --------------------------------
55 +
56 +
57 +Tracking exact terms to MCO...
58 +
59 +Mapping 4099 terms to MCO based on exact strings...
60 +
61 +Mapping 3770 terms to MCO - synonyms based on exact strings...
62 +
63 +Total of terms mapped by exact strings: 387
64 +Saving filtered terms from raw mapping...
65 +
66 +
67 +3712 unmapped terms based on exact strings
68 +Dropping duplicated unmapped term names...
69 +206 unmapped unique terms based on exact strings
70 +
71 +compute string similarity...
72 +
73 +Mapping to MCO 206 terms based on string similarity...
74 +
75 +Mapping to MCO - synonyms 152 terms based on string similarity...
76 +
77 +Unique terms mapped by string similarity: 73
78 +Total of terms mapped by string similarity: 1992
79 +Saving filtered terms from str mapping...
80 +
81 +
82 +--------------------END----------------------
83 +Total of terms mapped: 2379
84 +
85 +Total of terms unmapped: 1720
1 +/usr/local/lib/python3.6/dist-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning
2 + warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
3 +/home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/mapping2MCO_v6.py:313: SettingWithCopyWarning:
4 +A value is trying to be set on a copy of a slice from a DataFrame.
5 +Try using .loc[row_indexer,col_indexer] = value instead
6 +
7 +See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
8 + str_matches_odf["SOURCE"] = mco_ifile
9 +
10 +
11 +-------------------------------- PARAMETERS --------------------------------
12 +
13 +--inputPath Path of npl tagged file: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
14 +--iAnnotatedFile Input file of npl tagged file: srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv
15 +--iOntoFile Input file with the ontology entities (MCO-terms): gc_ontology_terms_v2.txt
16 +--iLinksFile Input file with links and id for the ontology (MCO-type-links): None
17 +--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): mco_terms_v0.2.json
18 +--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v4/
19 +--outputFile Output of the mapping process: srr_htregulondb.tsv
20 +--minPerMatch Minimal string matching percentage: 80
21 +--minCRFProbs Minimal crf probabilities allowed: 0.9
22 +
23 +
24 +
25 +
26 +
27 +-------------------------------- INPUTS --------------------------------
28 +
29 +
30 +npl tagged file
31 +
32 + SRR ... REPO_FILE
33 +0 SRR5742248 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
34 +5 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
35 +7 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
36 +
37 +[3 rows x 14 columns]
38 +
39 +ontology entities
40 +
41 + TERM_ID TERM_NAME
42 +0 MCO000000014 generically dependent continuant
43 +1 MCO000000015 radiation
44 +2 MCO000000016 electromagnetic radiation
45 +
46 +additional ontology of synonyms (MCO-syn-json)
47 +
48 + ENTITY_NAME TERM_ID TERM_NAME
49 +MCO000000019 continuant MCO000000019
50 +MCO000002475 culture medium MCO000002475
51 +MCO000002467_0 Organism MCO000002467 biologicentity
52 +
53 +
54 +-------------------------------- RESULTS --------------------------------
55 +
56 +
57 +Tracking exact terms to MCO...
58 +
59 +Mapping 3769 terms to MCO based on exact strings...
60 +
61 +Mapping 3440 terms to MCO - synonyms based on exact strings...
62 +
63 +Total of terms mapped by exact strings: 387
64 +Saving filtered terms from raw mapping...
65 +
66 +
67 +3382 unmapped terms based on exact strings
68 +Dropping duplicated unmapped term names...
69 +206 unmapped unique terms based on exact strings
70 +
71 +compute string similarity...
72 +
73 +Mapping to MCO 206 terms based on string similarity...
74 +
75 +Mapping to MCO - synonyms 152 terms based on string similarity...
76 +
77 +Unique terms mapped by string similarity: 73
78 +Total of terms mapped by string similarity: 1668
79 +Saving filtered terms from str mapping...
80 +
81 +
82 +--------------------END----------------------
83 +Total of terms mapped: 2055
84 +
85 +Total of terms unmapped: 1714
1 +
2 +
3 +-------------------------------- PARAMETERS --------------------------------
4 +
5 +--inputPath Path of npl tagged file: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
6 +--iAnnotatedFile Input file of npl tagged file: No_GSM_Metadata_Selected_v4.tsv
7 +--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v2/
8 +--outputFile Output of the mapping process: zika.json
9 +
10 +
11 +
12 +Total zika terms: 2351