Showing
56 changed files
with
2812 additions
and
3 deletions
... | @@ -57,6 +57,12 @@ if (!length(opt)){ | ... | @@ -57,6 +57,12 @@ if (!length(opt)){ |
57 | ## Input files and output directories | 57 | ## Input files and output directories |
58 | infoFile <- opt$infoFile | 58 | infoFile <- opt$infoFile |
59 | 59 | ||
60 | +if (!"gse" %in% names(gseInfo)){ | ||
61 | + stop("include at least gse column") | ||
62 | +} | ||
63 | +if (!"gsm" %in% names(gseInfo)){ | ||
64 | +    gseInfo$gsm <- "GSM" | ||
65 | +} | ||
60 | 66 | ||
61 | ## Load main variables | 67 | ## Load main variables |
62 | 68 | ||
... | @@ -89,4 +95,4 @@ for (geoid in unique(gseInfo$gse)) { | ... | @@ -89,4 +95,4 @@ for (geoid in unique(gseInfo$gse)) { |
89 | } | 95 | } |
90 | cat("download id: ", length(list.dirs(opt$downloadPath, recursive = FALSE))) | 96 | cat("download id: ", length(list.dirs(opt$downloadPath, recursive = FALSE))) |
91 | 97 | ||
92 | -message("Required GSE: ", ngse_down) | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
98 | +message("Required GSE: ", ngse_down) | ... | ... |
No preview for this file type
No preview for this file type
No preview for this file type
... | @@ -42,6 +42,11 @@ def get_crossref_info(info_df): | ... | @@ -42,6 +42,11 @@ def get_crossref_info(info_df): |
42 | - **pmid**: PubMed ID | 42 | - **pmid**: PubMed ID |
43 | """ | 43 | """ |
44 | def get_cite_info(info_df): | 44 | def get_cite_info(info_df): |
45 | + if(info_df.CASE_MATCH == "ZIKA"): | ||
46 | + cite_dict ={ | ||
47 | + "evidence_id":"", | ||
48 | + "evidence_name":"ZIKA", | ||
49 | + "pmid" : info_df.PMID} | ||
45 | cite_dict ={ | 50 | cite_dict ={ |
46 | "evidence_id": "", | 51 | "evidence_id": "", |
47 | "evidence_name" : "NPL-CRF", #NPL | 52 | "evidence_name" : "NPL-CRF", #NPL |
... | @@ -49,6 +54,8 @@ def get_cite_info(info_df): | ... | @@ -49,6 +54,8 @@ def get_cite_info(info_df): |
49 | } | 54 | } |
50 | return(cite_dict) | 55 | return(cite_dict) |
51 | def get_description(info_df): | 56 | def get_description(info_df): |
57 | + if(info_df.CASE_MATCH=="ZIKA"): | ||
58 | + mco_mapping = {} | ||
52 | if info_df.CASE_MATCH == "MCO" and info_df.SET == 100 and info_df.SORT == 100: | 59 | if info_df.CASE_MATCH == "MCO" and info_df.SET == 100 and info_df.SORT == 100: |
53 | mco_mapping = { | 60 | mco_mapping = { |
54 | "type": "term present on MCO" | 61 | "type": "term present on MCO" | ... | ... |
mapping_MCO/bin/format_fun_v4.py
0 → 100755
1 | +from numpy import nan | ||
2 | +from collections import OrderedDict | ||
3 | +from pandas import DataFrame as DF | ||
4 | +""" | ||
5 | + - **name**: nombre del termino registrado en la MCO | ||
6 | + - **term_id**: identificador del termino en RegulonDB (si existe) | ||
7 | + - **term_type**: tipo de termino, uno de los siguientes: "Organism", "Genetic background", "Medium", "Medium supplement", "Aeration", "Temperature", "pH", "Pressure", "Optical Density (OD)", "Growth phase", "Growth rate", "Vessel type", "Agitation speed" | ||
8 | + - **source_data**: subdocumento que contiene informacion de GEO de donde se extrajo la informacion de la GC | ||
9 | + - **source**: fuente de los datos [ GEO, ] | ||
10 | + - **id**: identificador del registro de la base de datos o fuente de datos | ||
11 | + - **field**: campo de donde se esta tomando la informacion de la GC [ metadata field] | ||
12 | + - **associatedPhrase**: Frase de donde se tomo la informacion | ||
13 | +""" | ||
14 | +def get_term_info(info_df, source): | ||
15 | + term_dict = { | ||
16 | + "name": info_df.TERM_NAME, #NPL output | ||
17 | + "term_id" : info_df.TERM_ID, #MCO | ||
18 | + "term_type": info_df.TERM_TYPE, #NPL | ||
19 | + "source_data": info_df.REPO_FILE, #NPL | ||
20 | + "source": source, | ||
21 | + "id": info_df.GSM, #NPL | ||
22 | + "field": info_df.BANGLINE, #NPL | ||
23 | + "associatedPhrase": info_df.FULL_TEXT #NPL | ||
24 | + } | ||
25 | + return(term_dict) | ||
26 | + | ||
27 | + | ||
28 | +""" | ||
29 | + - **objectId**: Identificador en la base de datos fuente | ||
30 | + - **externalCrossReferences_name**: nombre de la DB [ GEO ] | ||
31 | +""" | ||
32 | +def get_crossref_info(info_df, source): | ||
33 | + crossref_dict ={ | ||
34 | + "objectId": info_df.GSM, #NPL | ||
35 | + "externalCrossReferences_name" : source | ||
36 | + } | ||
37 | + return(crossref_dict) | ||
38 | + | ||
39 | +""" | ||
40 | + - **evidence_id**: Identificador de RegulondB asociado a la evidencia | ||
41 | + - **evidence_name**: nombre de la evidencia, si es que no cuenta con el identificador | ||
42 | + - **pmid**: PubMed ID | ||
43 | +""" | ||
44 | +def get_cite_info(info_df, esource): | ||
45 | + cite_dict ={ | ||
46 | + "evidence_id": "", | ||
47 | + "evidence_name" : esource, | ||
48 | + "pmid": info_df.PMID | ||
49 | + } | ||
50 | + return(cite_dict) | ||
51 | + | ||
52 | +def get_description(info_df, no_map=False): | ||
53 | + if(no_map): | ||
54 | + mco_mapping = { | ||
55 | + "type": "not present on MCO" | ||
56 | + } | ||
57 | + elif info_df.CASE_MATCH == "MCO" and info_df.SET == 100 and info_df.SORT == 100: | ||
58 | + mco_mapping = { | ||
59 | + "type": "term present on MCO" | ||
60 | + } | ||
61 | + else: | ||
62 | + mco_mapping = { | ||
63 | + "type": "string similarity", | ||
64 | + "score": info_df.SET | ||
65 | + } | ||
66 | + return(mco_mapping) | ||
67 | +""" | ||
68 | +#run it in the main for each field | ||
69 | + | ||
70 | + | ||
71 | +return: type | ||
72 | + | ||
73 | +id: string | ||
74 | +name: string | ||
75 | +description: string | ||
76 | +terms: list of dict | ||
77 | +externalCrossReferences: list of dict | ||
78 | +citations: list of dict | ||
79 | + | ||
80 | +""" | ||
81 | + | ||
82 | +def created_record(term_info_df, source = "GEO", no_map = False, esource = "NPL-CRF"): | ||
83 | + record_dict = OrderedDict() | ||
84 | + term_info_df = term_info_df.replace(nan, '', regex=True) | ||
85 | + record_dict["id"] = term_info_df.TERM_ID #it should be add if it have been mapped | ||
86 | + record_dict["name"] = term_info_df.TERM_NAME #a colum form NPL output | ||
87 | + record_dict["description"] = [get_description(term_info_df, no_map=no_map)] | ||
88 | + record_dict["terms"] = [get_term_info(term_info_df, source)] | ||
89 | + record_dict["externalCrossReferences"] = [get_crossref_info(term_info_df, source)] | ||
90 | + record_dict["citations"] = [get_cite_info(term_info_df, esource)] | ||
91 | + | ||
92 | + return(record_dict) | ||
93 | + | ||
94 | +def json2DataFrame(data): | ||
95 | + mco_syn_dic = dict() | ||
96 | + | ||
97 | + for j,i in enumerate(data): | ||
98 | + if "regulondb_id" in i.keys(): | ||
99 | + | ||
100 | + if "synonyms" in i.keys(): | ||
101 | + | ||
102 | + for k,syn in enumerate(i['synonyms']): | ||
103 | + dict_key = i['regulondb_id']+"_"+str(k) | ||
104 | + mco_syn_dic[dict_key] = { | ||
105 | + #ENTITY_NAME | ||
106 | + 'ENTITY_NAME' : i['name'], | ||
107 | + #ENTITY_SYN | ||
108 | + 'TERM_NAME': syn.lower(), | ||
109 | + #regulondb_id | ||
110 | + 'TERM_ID' : i['regulondb_id'] } | ||
111 | + | ||
112 | + elif "hasRelatedSynonyms" in i.keys(): | ||
113 | + | ||
114 | + for k,syn in enumerate(i['hasRelatedSynonyms']): | ||
115 | + dict_key = i['regulondb_id']+"_"+str(k) | ||
116 | + mco_syn_dic[dict_key] = { | ||
117 | + #ENTITY_NAME | ||
118 | + 'ENTITY_NAME' : i['name'], | ||
119 | + #ENTITY_SYN | ||
120 | + 'TERM_NAME': syn.lower(), | ||
121 | + #regulondb_id | ||
122 | + 'TERM_ID' : i['regulondb_id'] } | ||
123 | + else: | ||
124 | + dict_key = i['regulondb_id'] | ||
125 | + mco_syn_dic[dict_key] = { | ||
126 | + #ENTITY_NAME | ||
127 | + 'ENTITY_NAME' : i['name'], | ||
128 | + #ENTITY_SYN | ||
129 | + 'TERM_NAME': '', | ||
130 | + #regulondb_id | ||
131 | + 'TERM_ID' : i['regulondb_id'] } | ||
132 | + | ||
133 | + mco_syn_df = DF.from_dict(mco_syn_dic).T | ||
134 | + | ||
135 | + | ||
136 | + return(mco_syn_df) | ||
137 | + |
mapping_MCO/bin/format_fun_v6.py
0 → 100755
1 | +from numpy import nan | ||
2 | +#from collections import OrderedDict | ||
3 | +from pandas import DataFrame as DF | ||
4 | +import json | ||
5 | +from collections import defaultdict | ||
6 | +import format_fun_v6 as format_fun | ||
7 | + | ||
8 | +def to_json(df, source_info, evidence_source, ofname): | ||
9 | + df_terms = defaultdict(list) | ||
10 | + | ||
11 | + for idx,row in df.iterrows(): | ||
12 | + term_record = format_fun.get_term_info( | ||
13 | + row, | ||
14 | + source = source_info, | ||
15 | + map= row.MAP) | ||
16 | + df_terms[row.SRR].append(term_record) | ||
17 | + | ||
18 | + df_json = {} | ||
19 | + df_tmp = df.drop_duplicates("SRR", keep="first") | ||
20 | + for idx,row in df_tmp.iterrows(): | ||
21 | + srr_record = format_fun.created_record( | ||
22 | + info_df = row, | ||
23 | + term_list = df_terms[row.SRR], | ||
24 | + source = source_info, | ||
25 | + esource = evidence_source) | ||
26 | + df_json[row.SRR] = srr_record | ||
27 | + | ||
28 | + with open(ofname, "w") as output: | ||
29 | + json.dump(df_json, output, separators=(',', ':'), indent=4) | ||
30 | + | ||
31 | +def get_score(info_df): | ||
32 | + if info_df.CASE_MATCH == "MCO" and info_df.SET == 100 and info_df.SORT == 100: | ||
33 | + subtext = "term present on MCO" | ||
34 | + else: | ||
35 | + mco_mapping = { | ||
36 | + "type": "string similarity", | ||
37 | + "score": info_df.SET | ||
38 | + } | ||
39 | + return(mco_mapping) | ||
40 | +""" | ||
41 | + - **name**: nombre del termino registrado en la MCO | ||
42 | + - **term_id**: identificador del termino en RegulonDB (si existe) | ||
43 | + - **term_type**: tipo de termino, uno de los siguientes: "Organism", "Genetic background", "Medium", "Medium supplement", "Aeration", "Temperature", "pH", "Pressure", "Optical Density (OD)", "Growth phase", "Growth rate", "Vessel type", "Agitation speed" | ||
44 | + - **source_data**: subdocumento que contiene informacion de GEO de donde se extrajo la informacion de la GC | ||
45 | + - **source**: fuente de los datos [ GEO, ] | ||
46 | + - **id**: identificador del registro de la base de datos o fuente de datos | ||
47 | + - **field**: campo de donde se esta tomando la informacion de la GC [ metadata field] | ||
48 | + - **associatedPhrase**: Frase de donde se tomo la informacion | ||
49 | +""" | ||
50 | +def get_term_info(info_df, source, map=True): | ||
51 | + info_df = info_df.replace(nan, "", regex=True) | ||
52 | + term_dict = { | ||
53 | + "name": info_df.TERM_NAME, #NPL output | ||
54 | + "term_id" : info_df.TERM_ID, #MCO | ||
55 | + "term_type": info_df.TERM_TYPE, #NPL | ||
56 | + "source_data": { | ||
57 | + "source": source, | ||
58 | + "id": info_df.GSM, #NPL | ||
59 | + "field": info_df.BANGLINE, #NPL | ||
60 | + "associatedPhrase": info_df.FULL_TEXT, #NPL | ||
61 | + "description" : get_description(info_df, map), | ||
62 | + "similarity_percentage" : info_df.SET | ||
63 | + } | ||
64 | + } | ||
65 | + return(term_dict) | ||
66 | + | ||
67 | + | ||
68 | +""" | ||
69 | + - **objectId**: Identificador en la base de datos fuente | ||
70 | + - **externalCrossReferences_name**: nombre de la DB [ GEO ] | ||
71 | +""" | ||
72 | +def get_crossref_info(info_df, source): | ||
73 | + crossref_dict ={ | ||
74 | + "objectId": info_df.GSM, #NPL | ||
75 | + "externalCrossReferences_name" : source | ||
76 | + } | ||
77 | + return(crossref_dict) | ||
78 | + | ||
79 | +""" | ||
80 | + - **evidence_id**: Identificador de RegulondB asociado a la evidencia | ||
81 | + - **evidence_name**: nombre de la evidencia, si es que no cuenta con el identificador | ||
82 | + - **pmid**: PubMed ID | ||
83 | +""" | ||
84 | +def get_cite_info(info_df, esource): | ||
85 | + cite_dict ={ | ||
86 | + "evidence_id": "", | ||
87 | + "evidence_name" : esource, | ||
88 | + "pmid": info_df.PMID | ||
89 | + } | ||
90 | + return(cite_dict) | ||
91 | + | ||
92 | +def get_description(info_df, map=True): | ||
93 | + if not map: | ||
94 | + subtext = "absent in RegulonDB MCO" | ||
95 | + elif info_df.CASE_MATCH == "MCO" and info_df.SET == 100 and info_df.SORT == 100: | ||
96 | + subtext = "RegulonDB MCO term" | ||
97 | + else: | ||
98 | + subtext = "Similar term in RegulonDB MCO" | ||
99 | + return(subtext) | ||
100 | +""" | ||
101 | +#run it in the main for each field | ||
102 | + | ||
103 | + | ||
104 | +return: type | ||
105 | + | ||
106 | +id: string | ||
107 | +name: string | ||
108 | +description: string | ||
109 | +terms: list of dict | ||
110 | +externalCrossReferences: list of dict | ||
111 | +citations: list of dict | ||
112 | + | ||
113 | +""" | ||
114 | + | ||
115 | +def created_record(info_df, term_list, source = "GEO", esource = "NPL-CRF"): | ||
116 | + #record_dict = OrderedDict() | ||
117 | + record_dict = {} | ||
118 | + info_df = info_df.replace(nan, "", regex=True) | ||
119 | + record_dict["id"] = "" | ||
120 | + record_dict["name"] = "" | ||
121 | + record_dict["description"] = "" | ||
122 | + record_dict["terms"] = term_list | ||
123 | + record_dict["externalCrossReferences"] = [get_crossref_info(info_df, source)] | ||
124 | + record_dict["citations"] = [get_cite_info(info_df, esource)] | ||
125 | + | ||
126 | + return(record_dict) | ||
127 | + | ||
128 | +def json2DataFrame(data): | ||
129 | + mco_syn_dic = dict() | ||
130 | + | ||
131 | + for j,i in enumerate(data): | ||
132 | + if "regulondb_id" in i.keys(): | ||
133 | + | ||
134 | + if "synonyms" in i.keys(): | ||
135 | + | ||
136 | + for k,syn in enumerate(i['synonyms']): | ||
137 | + dict_key = i['regulondb_id']+"_"+str(k) | ||
138 | + mco_syn_dic[dict_key] = { | ||
139 | + #ENTITY_NAME | ||
140 | + 'ENTITY_NAME' : i['name'], | ||
141 | + #ENTITY_SYN | ||
142 | + 'TERM_NAME': syn.lower(), | ||
143 | + #regulondb_id | ||
144 | + 'TERM_ID' : i['regulondb_id'] } | ||
145 | + | ||
146 | + elif "hasRelatedSynonyms" in i.keys(): | ||
147 | + | ||
148 | + for k,syn in enumerate(i['hasRelatedSynonyms']): | ||
149 | + dict_key = i['regulondb_id']+"_"+str(k) | ||
150 | + mco_syn_dic[dict_key] = { | ||
151 | + #ENTITY_NAME | ||
152 | + 'ENTITY_NAME' : i['name'], | ||
153 | + #ENTITY_SYN | ||
154 | + 'TERM_NAME': syn.lower(), | ||
155 | + #regulondb_id | ||
156 | + 'TERM_ID' : i['regulondb_id'] } | ||
157 | + else: | ||
158 | + dict_key = i['regulondb_id'] | ||
159 | + mco_syn_dic[dict_key] = { | ||
160 | + #ENTITY_NAME | ||
161 | + 'ENTITY_NAME' : i['name'], | ||
162 | + #ENTITY_SYN | ||
163 | + 'TERM_NAME': '', | ||
164 | + #regulondb_id | ||
165 | + 'TERM_ID' : i['regulondb_id'] } | ||
166 | + | ||
167 | + mco_syn_df = DF.from_dict(mco_syn_dic).T | ||
168 | + | ||
169 | + | ||
170 | + return(mco_syn_df) | ||
171 | + |
mapping_MCO/bin/format_zika_v3.py
0 → 100755
1 | +# -*- coding: utf-8 -*- | ||
2 | +""" | ||
3 | +#Setup | ||
4 | +""" | ||
5 | + | ||
6 | +#################### Setup #################### | ||
7 | +from collections import defaultdict | ||
8 | +from optparse import OptionParser | ||
9 | +import os | ||
10 | +from numpy.core.fromnumeric import sort | ||
11 | +from pandas import read_csv, DataFrame, merge, concat, read_table | ||
12 | +from numpy import exp, nan | ||
13 | +import seaborn as sns | ||
14 | +from numpy import mean | ||
15 | + | ||
16 | +import matplotlib.pyplot as plt | ||
17 | +import matplotlib | ||
18 | +matplotlib.style.use('ggplot') | ||
19 | +# %matplotlib inline | ||
20 | + | ||
21 | +from collections import Counter | ||
22 | +import json | ||
23 | + | ||
24 | +from fuzzywuzzy import fuzz | ||
25 | +from fuzzywuzzy import process | ||
26 | + | ||
27 | +import format_fun | ||
28 | +import mapping_fun | ||
29 | +import sys | ||
30 | + | ||
31 | +""" | ||
32 | +# input parameters | ||
33 | +--inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ | ||
34 | +--iAnnotatedFile srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv | ||
35 | +--iOntoFile gc_ontology_terms_v2.txt | ||
36 | +--iLinksFile gc_ontology_terms_link_v2.txt | ||
37 | +--iSynFile mco_terms_v0.2.json | ||
38 | +--outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ | ||
39 | +--outputFile all_srr_IV_mapped.tsv | ||
40 | +--minPerMatch 90 | ||
41 | + | ||
42 | + | ||
43 | +#Example | ||
44 | +# nohup python3 mapping2MCO_v3.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv --iOntoFile gc_ontology_terms_v2.txt --iSynFile mco_terms_v0.2.json --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ --outputFile srr_htregulondb_mapped.tsv --minPerMatch 80 --minCRFProbs 0.9 > ../reports/srr_htregulondb_mapping_report.out & | ||
45 | +""" | ||
46 | +#################### Defining parameters #################### | ||
47 | +if __name__ == "__main__": | ||
48 | + parser = OptionParser() | ||
49 | + parser.add_option( | ||
50 | + "--inputPath", | ||
51 | + dest="input_path", | ||
52 | + help="Path of npl tagged file (crf output)", | ||
53 | + metavar="PATH") | ||
54 | + parser.add_option( | ||
55 | + "--iAnnotatedFile", | ||
56 | + dest="npl_fname", | ||
57 | + help="Input file of npl tagged file (crf output)", | ||
58 | + metavar="FILE", | ||
59 | + default="") | ||
60 | + parser.add_option( | ||
61 | + "--iOntoFile", | ||
62 | + dest="onto_fname", | ||
63 | + help="Input file with the ontology entities", | ||
64 | + metavar="FILE", | ||
65 | + default="") | ||
66 | + parser.add_option( | ||
67 | + "--iLinksFile", | ||
68 | + dest="links_fname", | ||
69 | + help="Input file with links and id for the ontology", | ||
70 | + metavar="FILE", | ||
71 | + default=None) | ||
72 | + parser.add_option( | ||
73 | + "--iSynFile", | ||
74 | + dest="syn_fname", | ||
75 | + help="Input file for the additional ontology of synonyms", | ||
76 | + metavar="FILE", | ||
77 | + default=None) | ||
78 | + parser.add_option( | ||
79 | + "--outputPath", | ||
80 | + dest="output_path", | ||
81 | + help="Output path to place output files", | ||
82 | + metavar="PATH") | ||
83 | + parser.add_option( | ||
84 | + "--outputFile", | ||
85 | + dest="out_fname", | ||
86 | + help="Output file name for the mapping process", | ||
87 | + metavar="FILE", | ||
88 | + default="") | ||
89 | + parser.add_option( | ||
90 | + "--minPerMatch", | ||
91 | + dest="min_score", | ||
92 | + help="Minimal string matching percentage") | ||
93 | + parser.add_option( | ||
94 | + "--minCRFProbs", | ||
95 | + dest="min_probs", | ||
96 | + help="Minimal crf probabilities") | ||
97 | + | ||
98 | + (options, args) = parser.parse_args() | ||
99 | + if len(args) > 0: | ||
100 | + parser.error("Any parameter given.") | ||
101 | + sys.exit(1) | ||
102 | + | ||
103 | + #################### DISP PARAMETERS #################### | ||
104 | + print('\n\n-------------------------------- PARAMETERS --------------------------------\n') | ||
105 | + print("--inputPath Path of npl tagged file: " + str(options.input_path)) | ||
106 | + print("--iAnnotatedFile Input file of npl tagged file: " + str(options.npl_fname)) | ||
107 | + print("--iOntoFile Input file with the ontology entities (MCO-terms): " + str(options.onto_fname)) | ||
108 | + print("--iLinksFile Input file with links and id for the ontology (MCO-type-links): " + str(options.links_fname)) | ||
109 | + print("--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): " + str(options.syn_fname)) | ||
110 | + print("--outputPath Output path to place output files: " + str(options.output_path)) | ||
111 | + print("--outputFile Output of the mapping process: " + str(options.out_fname)) | ||
112 | + print("--minPerMatch Minimal string matching percentage: " + str(options.min_score)) | ||
113 | + print("--minCRFProbs Minimal crf probabilities allowed: " + str(options.min_probs)) | ||
114 | + | ||
115 | + print("\n\n") | ||
116 | + repognrl = "http://pakal.ccg.unam.mx/cmendezc" | ||
117 | + reponame = "automatic-extraction-growth-conditions/tree/master/extraction-geo/download/srr_htregulondb" | ||
118 | + repo_url = '/'.join([repognrl,reponame]) | ||
119 | + | ||
120 | + # Input files | ||
121 | + min_score = int(options.min_score) | ||
122 | + min_probs = float(options.min_probs) | ||
123 | + npl_ifile = os.path.join(options.input_path, options.npl_fname) | ||
124 | + mco_ifile = os.path.join(options.input_path, options.onto_fname) | ||
125 | + mco_syn_ifile = os.path.join(options.input_path, options.syn_fname) | ||
126 | + | ||
127 | + #Output files | ||
128 | + raw_ofname = "_".join(["raw", options.out_fname]) | ||
129 | + rawmap_ofile = os.path.join(options.output_path, raw_ofname) | ||
130 | + str_ofname = "_".join(["sim", options.out_fname]) | ||
131 | + strmap_ofile = os.path.join(options.output_path, str_ofname) | ||
132 | + | ||
133 | + full_ofile = os.path.join(options.output_path, "full_"+options.out_fname) | ||
134 | + full_unmap_ofile = os.path.join(options.output_path, "full_unmap_"+options.out_fname) | ||
135 | + | ||
136 | + json_ofile = os.path.join(options.output_path, options.out_fname) | ||
137 | + json_ofile_map = json_ofile.replace(".tsv", "_map.json") | ||
138 | + json_ofile_unmap= json_ofile.replace(".tsv", "_unmap.json") | ||
139 | + json_ofile_list= json_ofile.replace(".tsv", "_list.json") | ||
140 | + json_ofile_df_list= json_ofile.replace(".tsv", "_df_list.json") | ||
141 | + | ||
142 | + #################### Load input data #################### | ||
143 | + # Load CRF-annotation | ||
144 | + exp_cols = {"SRR", "GSE", "GSM", "GPL", "PMID", "FULL_TEXT", "BANGLINE", "TERM_NAME", "TERM_TYPE", "PROB"} | ||
145 | + npl_full = read_table(npl_ifile, sep = "\t") | ||
146 | + | ||
147 | + obs_cols = set(npl_full.columns) | ||
148 | + | ||
149 | + if exp_cols.intersection(obs_cols) != exp_cols: | ||
150 | + ocol = ", ".join(list(exp_cols)) | ||
151 | + sys.exit(ocol + " expected columns for iAnnotatedFile" ) | ||
152 | + | ||
153 | + #Load MCO term names | ||
154 | + exp_cols = {"TERM_ID", "TERM_NAME"} | ||
155 | + mco_df_full = read_table(mco_ifile, sep = "\t") | ||
156 | + obs_cols = set(mco_df_full.columns) | ||
157 | + | ||
158 | + if exp_cols.intersection(obs_cols) != exp_cols: | ||
159 | + sys.exit("\"TERM_ID\" and \"TERM_NAME\" expected columns for iOntoFile" ) | ||
160 | + | ||
161 | + mco_df = mco_df_full[["TERM_ID","TERM_NAME"]] | ||
162 | + mco_df = mco_df.drop_duplicates(keep="first") | ||
163 | + mco_df = mco_df.dropna() | ||
164 | + | ||
165 | + #Load MCO links | ||
166 | + if options.links_fname is not None: | ||
167 | + print("\nLoad types...") | ||
168 | + mcolink_ifile = os.path.join(options.input_path, options.links_fname) | ||
169 | + exp_cols = {"TERM_ID", "TERM_TYPE"} | ||
170 | + mco_links_full = read_table(mcolink_ifile, sep = "\t") | ||
171 | + | ||
172 | + obs_cols = set(mco_links_full.columns) | ||
173 | + | ||
174 | + if exp_cols.intersection(obs_cols) != exp_cols: | ||
175 | + sys.exit("at least \"TERM_ID\" and \"TERM_TYPE\" expected columns for iLinksFile" ) | ||
176 | + | ||
177 | + mco_links = mco_links_full[["TERM_ID", "TERM_TYPE"]] | ||
178 | + mco_links = mco_links.drop_duplicates(keep="first") | ||
179 | + mco_links = mco_links.dropna() | ||
180 | + else: | ||
181 | + mco_links = None | ||
182 | + | ||
183 | + #Load MCO terms synonyms | ||
184 | + #format json from mco to dataframe | ||
185 | + mco_json = open(mco_syn_ifile ) | ||
186 | + data = json.load(mco_json) | ||
187 | + mco_syn = format_fun.json2DataFrame(data) | ||
188 | + | ||
189 | + df_json = defaultdict(list) | ||
190 | + | ||
191 | + for idx,row in npl_full.iterrows(): | ||
192 | + record = format_fun.created_record(row) | ||
193 | + df_json[row.SRR].append(record) | ||
194 | + | ||
195 | + df_json | ||
196 | + with open(json_ofile_list, "w") as output: | ||
197 | + json.dump(format_fun.created_record(df_json), output) | ||
198 | + | ||
199 | + with open(json_ofile_df_list, "a") as output: | ||
200 | + for idx,row in df_json.items(): | ||
201 | + json.dump(format_fun.created_record(row), output) | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
mapping_MCO/bin/format_zika_v4.py
0 → 100755
1 | +# -*- coding: utf-8 -*- | ||
2 | +""" | ||
3 | +#Setup | ||
4 | +""" | ||
5 | + | ||
6 | +#################### Setup #################### | ||
7 | +from collections import defaultdict | ||
8 | +from optparse import OptionParser | ||
9 | +import os | ||
10 | +from pandas import read_csv, DataFrame, merge, concat, read_table | ||
11 | +from numpy import exp, nan, mean | ||
12 | +import json | ||
13 | +import format_fun_v4 as format_fun | ||
14 | +import sys | ||
15 | + | ||
16 | +""" | ||
17 | +# input parameters | ||
18 | +--inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ | ||
19 | +--iAnnotatedFile srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv | ||
20 | +--outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ | ||
21 | +--outputFile all_srr_IV_mapped.tsv | ||
22 | + | ||
23 | + | ||
24 | +#Example | ||
25 | +# python3 /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/format_zika_v4.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile No_GSM_Metadata_Selected_v3.tsv --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ --outputFile zika_mapped.json | ||
26 | +""" | ||
27 | +#################### Defining parameters #################### | ||
28 | +if __name__ == "__main__": | ||
29 | + parser = OptionParser() | ||
30 | + parser.add_option( | ||
31 | + "--inputPath", | ||
32 | + dest="input_path", | ||
33 | + help="Path of npl tagged file (crf output)", | ||
34 | + metavar="PATH") | ||
35 | + parser.add_option( | ||
36 | + "--iAnnotatedFile", | ||
37 | + dest="npl_fname", | ||
38 | + help="Input file of npl tagged file (crf output)", | ||
39 | + metavar="FILE", | ||
40 | + default="") | ||
41 | + parser.add_option( | ||
42 | + "--outputPath", | ||
43 | + dest="output_path", | ||
44 | + help="Output path to place output files", | ||
45 | + metavar="PATH") | ||
46 | + parser.add_option( | ||
47 | + "--outputFile", | ||
48 | + dest="out_fname", | ||
49 | + help="Output file name for the mapping process", | ||
50 | + metavar="FILE", | ||
51 | + default="") | ||
52 | + | ||
53 | + (options, args) = parser.parse_args() | ||
54 | + if len(args) > 0: | ||
55 | + parser.error("Any parameter given.") | ||
56 | + sys.exit(1) | ||
57 | + | ||
58 | + #################### DISP PARAMETERS #################### | ||
59 | + print('\n\n-------------------------------- PARAMETERS --------------------------------\n') | ||
60 | + print("--inputPath Path of npl tagged file: " + str(options.input_path)) | ||
61 | + print("--iAnnotatedFile Input file of npl tagged file: " + str(options.npl_fname)) | ||
62 | + print("--outputPath Output path to place output files: " + str(options.output_path)) | ||
63 | + print("--outputFile Output of the mapping process: " + str(options.out_fname)) | ||
64 | + | ||
65 | + print("\n\n") | ||
66 | + | ||
67 | + # Input files | ||
68 | + npl_ifile = os.path.join(options.input_path, options.npl_fname) | ||
69 | + | ||
70 | + #Output files | ||
71 | + ofname = os.path.join(options.output_path, options.out_fname) | ||
72 | + | ||
73 | + #################### Load input data #################### | ||
74 | + # Load CRF-annotation | ||
75 | + exp_cols = {"SRR", "GSE", "GSM", "GPL", "PMID", "FULL_TEXT", "BANGLINE", "TERM_NAME", "TERM_TYPE", "PROB"} | ||
76 | + npl_full = read_table(npl_ifile, sep = "\t") | ||
77 | + | ||
78 | + obs_cols = set(npl_full.columns) | ||
79 | + | ||
80 | + if exp_cols.intersection(obs_cols) != exp_cols: | ||
81 | + ocol = ", ".join(list(exp_cols)) | ||
82 | + sys.exit(ocol + " expected columns for iAnnotatedFile" ) | ||
83 | + | ||
84 | + df_json = defaultdict(list) | ||
85 | + | ||
86 | + for idx,row in npl_full.iterrows(): | ||
87 | + record = format_fun.created_record(row, source = "ZIKAdb", no_map = True, esource = "database") | ||
88 | + if(idx<2): print(record) | ||
89 | + #record_json = json.dumps(record) | ||
90 | + record_json = record | ||
91 | + df_json[row.SRR].append(record_json) | ||
92 | + | ||
93 | + """ | ||
94 | + with open(ofname, "a") as output: | ||
95 | + output.write("field:[") | ||
96 | + sep="" | ||
97 | + for k,v in df_json.items(): | ||
98 | + output.write(sep) | ||
99 | + json.dump(v, output) | ||
100 | + sep="," | ||
101 | + output.write("]") | ||
102 | + | ||
103 | + """ | ||
104 | + with open(ofname, "a") as output: | ||
105 | + output.write("{") | ||
106 | + sep="" | ||
107 | + for k,v in df_json.items(): | ||
108 | + output.write(sep) | ||
109 | + output.write("\""+k+"\"") | ||
110 | + output.write(":") | ||
111 | + record_list = { | ||
112 | + "growth_conditions": df_json[k] | ||
113 | + } | ||
114 | + json.dump(record_list, output) | ||
115 | + sep="," | ||
116 | + output.write("}") | ||
117 | + | ||
118 | + df=open(ofname) | ||
119 | + df=json.load(df) | ||
120 | + |
mapping_MCO/bin/format_zika_v5.py
0 → 100755
1 | +# -*- coding: utf-8 -*- | ||
2 | +""" | ||
3 | +#Setup | ||
4 | +""" | ||
5 | + | ||
6 | +#################### Setup #################### | ||
7 | +from optparse import OptionParser | ||
8 | +import os | ||
9 | +from pandas import read_csv, DataFrame, merge, concat, read_table | ||
10 | +from numpy import mean | ||
11 | +import format_fun_v6 as format_fun | ||
12 | +import sys | ||
13 | + | ||
14 | +""" | ||
15 | +# input parameters | ||
16 | +--inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ | ||
17 | +--iAnnotatedFile srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv | ||
18 | +--outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ | ||
19 | +--outputFile all_srr_IV_mapped.tsv | ||
20 | + | ||
21 | + | ||
22 | +#Example | ||
23 | +# python3 /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/format_zika_v5.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile No_GSM_Metadata_Selected_v4.tsv --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v2/ --outputFile zika.json > automatic-extraction-growth-conditions/mapping_MCO/reports/zika_formated_report.out | ||
24 | +# python3 /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/format_zika_v5.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile No_GSM_Metadata_Selected_v4.tsv --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/test/ --outputFile zika_mapped.json > automatic-extraction-growth-conditions/mapping_MCO/test/zika_mapping_report.out | ||
25 | + | ||
26 | +""" | ||
27 | +#################### Defining parameters #################### | ||
28 | +if __name__ == "__main__": | ||
29 | + parser = OptionParser() | ||
30 | + parser.add_option( | ||
31 | + "--inputPath", | ||
32 | + dest="input_path", | ||
33 | + help="Path of npl tagged file (crf output)", | ||
34 | + metavar="PATH") | ||
35 | + parser.add_option( | ||
36 | + "--iAnnotatedFile", | ||
37 | + dest="npl_fname", | ||
38 | + help="Input file of npl tagged file (crf output)", | ||
39 | + metavar="FILE", | ||
40 | + default="") | ||
41 | + parser.add_option( | ||
42 | + "--outputPath", | ||
43 | + dest="output_path", | ||
44 | + help="Output path to place output files", | ||
45 | + metavar="PATH") | ||
46 | + parser.add_option( | ||
47 | + "--outputFile", | ||
48 | + dest="out_fname", | ||
49 | + help="Output file name for the mapping process", | ||
50 | + metavar="FILE", | ||
51 | + default="") | ||
52 | + | ||
53 | + (options, args) = parser.parse_args() | ||
54 | + if len(args) > 0: | ||
55 | + parser.error("Any parameter given.") | ||
56 | + sys.exit(1) | ||
57 | + | ||
58 | + #################### DISP PARAMETERS #################### | ||
59 | + print('\n\n-------------------------------- PARAMETERS --------------------------------\n') | ||
60 | + print("--inputPath Path of npl tagged file: " + str(options.input_path)) | ||
61 | + print("--iAnnotatedFile Input file of npl tagged file: " + str(options.npl_fname)) | ||
62 | + print("--outputPath Output path to place output files: " + str(options.output_path)) | ||
63 | + print("--outputFile Output of the mapping process: " + str(options.out_fname)) | ||
64 | + | ||
65 | + print("\n\n") | ||
66 | + | ||
67 | + # Input files | ||
68 | + npl_ifile = os.path.join(options.input_path, options.npl_fname) | ||
69 | + | ||
70 | + #Output files | ||
71 | + ofname = os.path.join(options.output_path, options.out_fname) | ||
72 | + | ||
73 | + #################### Load input data #################### | ||
74 | + # Load CRF-annotation | ||
75 | + exp_cols = {"SRR", "GSE", "GSM", "GPL", "PMID", "FULL_TEXT", "BANGLINE", "TERM_NAME", "TERM_TYPE", "PROB"} | ||
76 | + npl_full = read_table(npl_ifile, sep = "\t") | ||
77 | + npl_full = npl_full.drop_duplicates() | ||
78 | + | ||
79 | + print(f"Total zika terms: {len(npl_full)} ") | ||
80 | + obs_cols = set(npl_full.columns) | ||
81 | + | ||
82 | + if exp_cols.intersection(obs_cols) != exp_cols: | ||
83 | + ocol = ", ".join(list(exp_cols)) | ||
84 | + sys.exit(ocol + " expected columns for iAnnotatedFile" ) | ||
85 | + """ | ||
86 | + df_terms = defaultdict(list) | ||
87 | + | ||
88 | + for idx,row in npl_full.iterrows(): | ||
89 | + term_record = format_fun.get_term_info(row, source = "ZIKAdb", map=False) | ||
90 | + df_terms[row.SRR].append(term_record) | ||
91 | + | ||
92 | + df_json = {} | ||
93 | + df_tmp = npl_full.drop_duplicates("SRR", keep="first") | ||
94 | + for idx,row in df_tmp.iterrows(): | ||
95 | + srr_record = format_fun.created_record( | ||
96 | + info_df = row, | ||
97 | + term_list = df_terms[row.SRR], | ||
98 | + source = "ZIKAdb", | ||
99 | + esource = "database") | ||
100 | + df_json[row.SRR] = srr_record | ||
101 | + | ||
102 | + with open(ofname, "w") as output: | ||
103 | + json.dump(df_json, output, separators=(',', ':'), indent=4) | ||
104 | + | ||
105 | + df=open(ofname) | ||
106 | + df=json.load(df) | ||
107 | + print(df["ERR1399578"]) | ||
108 | + """ | ||
109 | + npl_full["MAP"] = False | ||
110 | + format_fun.to_json( | ||
111 | + df = npl_full, | ||
112 | + source_info = "ZIKAdb", | ||
113 | + evidence_source = "database", | ||
114 | + ofname = ofname) | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
... | @@ -23,7 +23,8 @@ import json | ... | @@ -23,7 +23,8 @@ import json |
23 | from fuzzywuzzy import fuzz | 23 | from fuzzywuzzy import fuzz |
24 | from fuzzywuzzy import process | 24 | from fuzzywuzzy import process |
25 | 25 | ||
26 | -import format_fun | 26 | +#import format_fun |
27 | +import format_fun_v4 as format_fun | ||
27 | import mapping_fun | 28 | import mapping_fun |
28 | import sys | 29 | import sys |
29 | 30 | ||
... | @@ -338,5 +339,6 @@ if __name__ == "__main__": | ... | @@ -338,5 +339,6 @@ if __name__ == "__main__": |
338 | with open(json_ofile_unmap, "a") as output: | 339 | with open(json_ofile_unmap, "a") as output: |
339 | for idx,row in full_unmap.iterrows(): | 340 | for idx,row in full_unmap.iterrows(): |
340 | json.dump(format_fun.created_record(row), output) | 341 | json.dump(format_fun.created_record(row), output) |
342 | + | ||
343 | + | ||
341 | 344 | ||
342 | - | ||
... | \ No newline at end of file | ... | \ No newline at end of file | ... | ... |
mapping_MCO/bin/mapping2MCO_v4.py
0 → 100755
1 | +# -*- coding: utf-8 -*- | ||
2 | +""" | ||
3 | +#Setup | ||
4 | +""" | ||
5 | + | ||
6 | +#################### Setup #################### | ||
7 | +from optparse import OptionParser | ||
8 | +import os | ||
9 | +from numpy.core.fromnumeric import sort | ||
10 | +from pandas import read_csv, DataFrame, merge, concat, read_table | ||
11 | +from numpy import exp, nan | ||
12 | +import seaborn as sns | ||
13 | +from numpy import mean | ||
14 | + | ||
15 | +import matplotlib.pyplot as plt | ||
16 | +import matplotlib | ||
17 | +matplotlib.style.use('ggplot') | ||
18 | +# %matplotlib inline | ||
19 | + | ||
20 | +from collections import Counter, defaultdict | ||
21 | +import json | ||
22 | + | ||
23 | +from fuzzywuzzy import fuzz | ||
24 | +from fuzzywuzzy import process | ||
25 | + | ||
26 | +#import format_fun | ||
27 | +import format_fun_v4 as format_fun | ||
28 | +import mapping_fun | ||
29 | +import sys | ||
30 | + | ||
31 | +""" | ||
32 | +# input parameters | ||
33 | +--inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ | ||
34 | +--iAnnotatedFile srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv | ||
35 | +--iOntoFile gc_ontology_terms_v2.txt | ||
36 | +--iLinksFile gc_ontology_terms_link_v2.txt | ||
37 | +--iSynFile mco_terms_v0.2.json | ||
38 | +--outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ | ||
39 | +--outputFile all_srr_IV_mapped.tsv | ||
40 | +--minPerMatch 90 | ||
41 | + | ||
42 | + | ||
43 | +#Example | ||
44 | +# nohup python3 mapping2MCO_v3.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv --iOntoFile gc_ontology_terms_v2.txt --iSynFile mco_terms_v0.2.json --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ --outputFile srr_htregulondb_mapped.tsv --minPerMatch 80 --minCRFProbs 0.9 > ../reports/srr_htregulondb_mapping_report.out & | ||
45 | +""" | ||
#################### Defining parameters ####################
if __name__ == "__main__":
    # CLI entry point: map NPL-CRF growth-condition terms to the MCO ontology
    # in two passes (exact string match, then fuzzy string similarity), and
    # write TSV + JSON outputs for both mapped and unmapped terms.
    parser = OptionParser()
    parser.add_option(
        "--inputPath",
        dest="input_path",
        help="Path of npl tagged file (crf output)",
        metavar="PATH")
    parser.add_option(
        "--iAnnotatedFile",
        dest="npl_fname",
        help="Input file of npl tagged file (crf output)",
        metavar="FILE",
        default="")
    parser.add_option(
        "--iOntoFile",
        dest="onto_fname",
        help="Input file with the ontology entities",
        metavar="FILE",
        default="")
    parser.add_option(
        "--iLinksFile",
        dest="links_fname",
        help="Input file with links and id for the ontology",
        metavar="FILE",
        default=None)
    parser.add_option(
        "--iSynFile",
        dest="syn_fname",
        help="Input file for the additional ontology of synonyms",
        metavar="FILE",
        default=None)
    parser.add_option(
        "--outputPath",
        dest="output_path",
        help="Output path to place output files",
        metavar="PATH")
    parser.add_option(
        "--outputFile",
        dest="out_fname",
        help="Output file name for the mapping process",
        metavar="FILE",
        default="")
    parser.add_option(
        "--minPerMatch",
        dest="min_score",
        help="Minimal string matching percentage")
    parser.add_option(
        "--minCRFProbs",
        dest="min_probs",
        help="Minimal crf probabilities")

    (options, args) = parser.parse_args()
    if len(args) > 0:
        # parser.error() prints the message and exits with status 2, so the
        # sys.exit(1) that used to follow it was unreachable and is removed.
        parser.error("Any parameter given.")

    #################### DISP PARAMETERS ####################
    print('\n\n-------------------------------- PARAMETERS --------------------------------\n')
    print("--inputPath Path of npl tagged file: " + str(options.input_path))
    print("--iAnnotatedFile Input file of npl tagged file: " + str(options.npl_fname))
    print("--iOntoFile Input file with the ontology entities (MCO-terms): " + str(options.onto_fname))
    print("--iLinksFile Input file with links and id for the ontology (MCO-type-links): " + str(options.links_fname))
    print("--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): " + str(options.syn_fname))
    print("--outputPath Output path to place output files: " + str(options.output_path))
    print("--outputFile Output of the mapping process: " + str(options.out_fname))
    print("--minPerMatch Minimal string matching percentage: " + str(options.min_score))
    print("--minCRFProbs Minimal crf probabilities allowed: " + str(options.min_probs))

    print("\n\n")
    # GitLab location of the downloaded GEO soft files (used for REPO_FILE).
    repognrl = "http://pakal.ccg.unam.mx/cmendezc"
    reponame = "automatic-extraction-growth-conditions/tree/master/extraction-geo/download/srr_htregulondb"
    repo_url = '/'.join([repognrl, reponame])

    # Input files
    min_score = int(options.min_score)
    min_probs = float(options.min_probs)
    npl_ifile = os.path.join(options.input_path, options.npl_fname)
    mco_ifile = os.path.join(options.input_path, options.onto_fname)
    mco_syn_ifile = os.path.join(options.input_path, options.syn_fname)

    # Output files
    raw_ofname = "_".join(["raw", options.out_fname])
    rawmap_ofile = os.path.join(options.output_path, raw_ofname)
    str_ofname = "_".join(["sim", options.out_fname])
    strmap_ofile = os.path.join(options.output_path, str_ofname)

    full_ofile = os.path.join(options.output_path, "full_" + options.out_fname)
    full_unmap_ofile = os.path.join(options.output_path, "full_unmap_" + options.out_fname)

    json_ofile = os.path.join(options.output_path, options.out_fname)
    json_ofile_map = json_ofile.replace(".tsv", "_map.json")
    json_ofile_unmap = json_ofile.replace(".tsv", "_unmap.json")

    #################### Load input data ####################
    # Load CRF-annotation; these columns must be present in the input table.
    exp_cols = {"SRR", "GSE", "GSM", "GPL", "PMID", "FULL_TEXT", "BANGLINE", "TERM_NAME", "TERM_TYPE", "PROB"}
    npl_full = read_table(npl_ifile, sep="\t")

    obs_cols = set(npl_full.columns)

    if exp_cols.intersection(obs_cols) != exp_cols:
        ocol = ", ".join(list(exp_cols))
        sys.exit(ocol + " expected columns for iAnnotatedFile")

    # Keep only confident CRF predictions, then deduplicate and drop NA rows.
    npl_df = npl_full[npl_full.PROB >= min_probs]
    npl_df = npl_df.drop_duplicates(keep="first")
    npl_df = npl_df.dropna()

    # Cleaning input: translate NPL term types into MCO vocabulary.
    npl_df['TERM_TYPE'] = [mapping_fun.transterm_npl2mco(term) for term in npl_df.TERM_TYPE]
    # filter non-mco terms types
    npl_df = npl_df[npl_df.TERM_TYPE != "exTag Type"]

    # add repofile_ source. access to stored files at gitLab
    source_access = ['/'.join([repo_url, gse, gse + '.soft.gz']) for gse in npl_df['GSE']]
    npl_df['REPO_FILE'] = source_access

    ## remove additional spaces and the PMID_ prefix
    npl_df['TERM_NAME'] = [txt.strip() for txt in npl_df['TERM_NAME']]
    npl_df['PMID'] = [pmid.replace("PMID_", "") for pmid in npl_df['PMID']]

    # Load MCO term names
    exp_cols = {"TERM_ID", "TERM_NAME"}
    mco_df_full = read_table(mco_ifile, sep="\t")
    obs_cols = set(mco_df_full.columns)

    if exp_cols.intersection(obs_cols) != exp_cols:
        sys.exit("\"TERM_ID\" and \"TERM_NAME\" expected columns for iOntoFile")

    mco_df = mco_df_full[["TERM_ID", "TERM_NAME"]]
    mco_df = mco_df.drop_duplicates(keep="first")
    mco_df = mco_df.dropna()

    # Load MCO links (optional TERM_ID -> TERM_TYPE table)
    if options.links_fname is not None:
        print("\nLoad types...")
        mcolink_ifile = os.path.join(options.input_path, options.links_fname)
        exp_cols = {"TERM_ID", "TERM_TYPE"}
        mco_links_full = read_table(mcolink_ifile, sep="\t")

        obs_cols = set(mco_links_full.columns)

        if exp_cols.intersection(obs_cols) != exp_cols:
            sys.exit("at least \"TERM_ID\" and \"TERM_TYPE\" expected columns for iLinksFile")

        mco_links = mco_links_full[["TERM_ID", "TERM_TYPE"]]
        mco_links = mco_links.drop_duplicates(keep="first")
        mco_links = mco_links.dropna()
    else:
        mco_links = None

    # Load MCO terms synonyms: format json from mco to dataframe.
    # `with` ensures the handle is closed (it previously leaked).
    with open(mco_syn_ifile) as mco_json:
        data = json.load(mco_json)
    mco_syn = format_fun.json2DataFrame(data)

    print('\n\n-------------------------------- INPUTS --------------------------------\n')

    print("\nnpl tagged file\n")
    print(npl_df.head(3))
    print("\nontology entities\n")
    print(mco_df.head(3))
    if options.links_fname is not None:
        print("\nlinks and id for the ontology (MCO-type-links)\n")
        print(mco_links.head(3))
    print("\nadditional ontology of synonyms (MCO-syn-json)\n")
    print(mco_syn.head(3))

    print('\n\n-------------------------------- RESULTS --------------------------------\n')

    #################### mappping to MCO exact string ####################
    print(f"\nMapping {len(npl_df.index)} terms to MCO based on exact strings...\n")

    # first mapping: exact string match against MCO term names
    raw_matches = mapping_fun.raw_map_mco(
        npl_df=npl_df,
        mco_df=mco_df,
        mco_links=mco_links,
        unmap=True)

    # save file name source of the raw mapping
    raw_matches["SOURCE"] = mco_ifile
    # additional column to merge
    raw_matches["ENTITY_NAME"] = ""

    #################### mappping to MCO.syn exact string ####################

    # define unmapped (rows whose TERM_ID is still NA)
    raw_mco_unmap = raw_matches[raw_matches.isna().TERM_ID]
    # input for te second step
    raw_mco_unmap = raw_mco_unmap[list(npl_df.columns)]

    print(f"\nMapping {len(raw_mco_unmap.index)} terms to MCO - synonyms based on exact strings...\n")

    # second mapping: exact string match against MCO synonyms
    raw_matches_syn = mapping_fun.raw_map_mco(
        npl_df=raw_mco_unmap,
        mco_df=mco_syn,
        unmap=True)

    # additional column to merge
    raw_matches_syn["SOURCE"] = mco_syn_ifile

    #################### save mapped terms based on exact strings ####################

    # all exact-string matches (MCO names + synonyms)
    raw_map_odf = concat([raw_matches, raw_matches_syn], sort=True).dropna()

    print(raw_map_odf.head(3))
    print(f"Total of terms mapped by exact strings: {len(raw_map_odf.index)}")
    print("Saving filtered terms from raw mapping...\n\n")

    raw_map_odf.to_csv(rawmap_ofile, sep="\t", header=True, index=False)

    #################### unmmaped raw terms ####################
    raw_mco_syn_unmap = raw_matches_syn[raw_matches_syn.isna().TERM_ID]
    raw_mco_syn_unmap = raw_mco_syn_unmap[list(npl_df.columns)]

    print(f"\n{len(raw_mco_syn_unmap.index)} unmapped terms based on exact strings")
    print("Dropping duplicated unmapped term names...\n")
    raw_mco_syn_unmap = raw_mco_syn_unmap.drop_duplicates("TERM_NAME")

    print(f"{len(raw_mco_syn_unmap.index)} unmapped unique terms based on exact strings\n")

    #################### string similarity mapping ####################
    ### Matching unmaped term names by fuzzy string similarity
    print(f"\nMapping to MCO {len(raw_mco_syn_unmap.index)} terms based on string similarity...\n")

    str_matches = mapping_fun.str_match_map_mco(raw_mco_syn_unmap, mco_df, mco_links=mco_links, min_match=0, npl_merges=False)
    str_matches_odf = str_matches[str_matches.SET >= min_score]
    str_matches_odf["SOURCE"] = mco_ifile

    #################### unmmaped sim terms (MCO) ####################
    str_mco_unmap = str_matches[str_matches.SET < min_score]
    str_mco_unmap = str_mco_unmap.drop_duplicates("TERM_NAME")

    print(f"\nMapping to MCO - synonyms {len(str_mco_unmap.index)} terms based on string siilarity..\n")
    str_matches_syn = mapping_fun.str_match_map_mco(str_mco_unmap, mco_syn, min_match=min_score, npl_merges=False)
    str_matches_syn_odf = str_matches_syn[str_matches_syn.SET >= min_score]
    str_matches_syn_odf["SOURCE"] = mco_syn_ifile

    #################### save str-sim map terms ####################
    all_str_matches_odf = concat([str_matches_odf, str_matches_syn_odf], sort=True).dropna()

    print(f"Unique terms mapped by string similarity: {len(all_str_matches_odf.index)}")

    all_str_matches_npl_odf = merge(npl_df, all_str_matches_odf, on=["TERM_NAME"], how="inner")

    print(f"Total of terms mapped by string similarity: {len(all_str_matches_npl_odf.index)}")
    print("Saving filtered terms from str mapping...")

    all_str_matches_npl_odf.to_csv(strmap_ofile, sep="\t", header=True, index=False)

    #################### Formatting json ####################
    # Exact matches get perfect similarity scores for the merged output.
    raw_map_odf["CASE_MATCH"] = "MCO"
    raw_map_odf["SET"] = 100
    raw_map_odf["SORT"] = 100

    full_map = concat([all_str_matches_npl_odf, raw_map_odf], sort=True)

    full_map.to_csv(full_ofile, sep="\t", header=True, index=False)

    print(f"Total of terms mapped: {len(full_map.index)}")

    # Group one JSON record per mapped term, keyed by its SRR run id.
    df_json = defaultdict(list)

    for idx, row in full_map.iterrows():
        record = format_fun.created_record(row)
        df_json[row.SRR].append(record)
        if idx < 2:
            print(record)

    # Write a single JSON object keyed by SRR. Mode "w" (was "a"): appending
    # to a previous run's file would yield `{...}{...}`, which the validation
    # json.load below cannot parse.
    with open(json_ofile_map, "w") as output:
        output.write("{")
        sep = ""
        for k, v in df_json.items():
            # was `v != {}`, which compares a list to a dict and is always
            # True; test list emptiness instead
            if v:
                output.write(sep)
                output.write("\"" + k + "\"")
                output.write(":")
                record_list = {
                    "growth_conditions": v
                }
                json.dump(record_list, output)
                sep = ","
        output.write("}")

    # Round-trip sanity check: the emitted file must be valid JSON.
    with open(json_ofile_map) as fh:
        df = json.load(fh)

    # Terms that neither mapping strategy resolved.
    full_unmap = merge(npl_df, full_map[["TERM_NAME", "TERM_ID"]], on=["TERM_NAME"], how='left')
    full_unmap = full_unmap[full_unmap.isna().TERM_ID]
    print(full_unmap.head(3))

    print(f"Total of terms unmapped: {len(full_unmap.index)}")

    full_unmap["SOURCE"] = ""
    full_unmap["CASE_MATCH"] = ""
    full_unmap["SET"] = 0
    full_unmap["SORT"] = 0

    full_unmap.to_csv(full_unmap_ofile, sep="\t", header=True, index=False)

    df_json = defaultdict(list)

    for idx, row in full_unmap.iterrows():
        record = format_fun.created_record(row, no_map=True)
        df_json[row.SRR].append(record)
        if idx < 2:
            print(record)

    # Same writer for the unmapped records ("w" for the same reason as above).
    with open(json_ofile_unmap, "w") as output:
        output.write("{")
        sep = ""
        for k, v in df_json.items():
            output.write(sep)
            output.write("\"" + k + "\"")
            output.write(":")
            record_list = {
                "growth_conditions": v
            }
            json.dump(record_list, output)
            sep = ","
        output.write("}")

    # Round-trip sanity check for the unmapped JSON as well.
    with open(json_ofile_unmap) as fh:
        df = json.load(fh)
mapping_MCO/bin/mapping2MCO_v5.py
0 → 100755
1 | +# -*- coding: utf-8 -*- | ||
2 | +""" | ||
3 | +#Setup | ||
4 | +""" | ||
5 | + | ||
6 | +#################### Setup #################### | ||
7 | +from optparse import OptionParser | ||
8 | +import os | ||
9 | +from numpy.core.fromnumeric import sort | ||
10 | +from pandas import read_csv, DataFrame, merge, concat, read_table | ||
11 | +from numpy import exp, nan | ||
12 | +import seaborn as sns | ||
13 | +from numpy import mean | ||
14 | + | ||
15 | +import matplotlib.pyplot as plt | ||
16 | +import matplotlib | ||
17 | +matplotlib.style.use('ggplot') | ||
18 | +# %matplotlib inline | ||
19 | + | ||
20 | +from collections import Counter, defaultdict | ||
21 | +import json | ||
22 | + | ||
23 | +from fuzzywuzzy import fuzz | ||
24 | +from fuzzywuzzy import process | ||
25 | + | ||
26 | +#import format_fun | ||
27 | +import format_fun_v6 as format_fun | ||
28 | +import mapping_fun | ||
29 | +import sys | ||
30 | + | ||
31 | +""" | ||
32 | +# input parameters | ||
33 | +--inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ | ||
34 | +--iAnnotatedFile srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv | ||
35 | +--iOntoFile gc_ontology_terms_v2.txt | ||
36 | +--iLinksFile gc_ontology_terms_link_v2.txt | ||
37 | +--iSynFile mco_terms_v0.2.json | ||
38 | +--outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ | ||
39 | +--outputFile all_srr_IV_mapped.tsv | ||
40 | +--minPerMatch 90 | ||
41 | + | ||
42 | + | ||
43 | +#Example | ||
44 | +# nohup python3 /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/mapping2MCO_v5.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv --iOntoFile gc_ontology_terms_v2.txt --iSynFile mco_terms_v0.2.json --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v2/ --outputFile srr_htregulondb_v2.tsv --minPerMatch 80 --minCRFProbs 0.9 > /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v2/srr_htregulondb_mapping_report_v2.out & | ||
45 | +""" | ||
46 | +#################### Defining parameters #################### | ||
47 | +if __name__ == "__main__": | ||
48 | + parser = OptionParser() | ||
49 | + parser.add_option( | ||
50 | + "--inputPath", | ||
51 | + dest="input_path", | ||
52 | + help="Path of npl tagged file (crf output)", | ||
53 | + metavar="PATH") | ||
54 | + parser.add_option( | ||
55 | + "--iAnnotatedFile", | ||
56 | + dest="npl_fname", | ||
57 | + help="Input file of npl tagged file (crf output)", | ||
58 | + metavar="FILE", | ||
59 | + default="") | ||
60 | + parser.add_option( | ||
61 | + "--iOntoFile", | ||
62 | + dest="onto_fname", | ||
63 | + help="Input file with the ontology entities", | ||
64 | + metavar="FILE", | ||
65 | + default="") | ||
66 | + parser.add_option( | ||
67 | + "--iLinksFile", | ||
68 | + dest="links_fname", | ||
69 | + help="Input file with links and id for the ontology", | ||
70 | + metavar="FILE", | ||
71 | + default=None) | ||
72 | + parser.add_option( | ||
73 | + "--iSynFile", | ||
74 | + dest="syn_fname", | ||
75 | + help="Input file for the additional ontology of synonyms", | ||
76 | + metavar="FILE", | ||
77 | + default=None) | ||
78 | + parser.add_option( | ||
79 | + "--outputPath", | ||
80 | + dest="output_path", | ||
81 | + help="Output path to place output files", | ||
82 | + metavar="PATH") | ||
83 | + parser.add_option( | ||
84 | + "--outputFile", | ||
85 | + dest="out_fname", | ||
86 | + help="Output file name for the mapping process", | ||
87 | + metavar="FILE", | ||
88 | + default="") | ||
89 | + parser.add_option( | ||
90 | + "--minPerMatch", | ||
91 | + dest="min_score", | ||
92 | + help="Minimal string matching percentage") | ||
93 | + parser.add_option( | ||
94 | + "--minCRFProbs", | ||
95 | + dest="min_probs", | ||
96 | + help="Minimal crf probabilities") | ||
97 | + | ||
98 | + (options, args) = parser.parse_args() | ||
99 | + if len(args) > 0: | ||
100 | + parser.error("Any parameter given.") | ||
101 | + sys.exit(1) | ||
102 | + | ||
103 | + #################### DISP PARAMETERS #################### | ||
104 | + print('\n\n-------------------------------- PARAMETERS --------------------------------\n') | ||
105 | + print("--inputPath Path of npl tagged file: " + str(options.input_path)) | ||
106 | + print("--iAnnotatedFile Input file of npl tagged file: " + str(options.npl_fname)) | ||
107 | + print("--iOntoFile Input file with the ontology entities (MCO-terms): " + str(options.onto_fname)) | ||
108 | + print("--iLinksFile Input file with links and id for the ontology (MCO-type-links): " + str(options.links_fname)) | ||
109 | + print("--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): " + str(options.syn_fname)) | ||
110 | + print("--outputPath Output path to place output files: " + str(options.output_path)) | ||
111 | + print("--outputFile Output of the mapping process: " + str(options.out_fname)) | ||
112 | + print("--minPerMatch Minimal string matching percentage: " + str(options.min_score)) | ||
113 | + print("--minCRFProbs Minimal crf probabilities allowed: " + str(options.min_probs)) | ||
114 | + | ||
115 | + print("\n\n") | ||
116 | + repognrl = "http://pakal.ccg.unam.mx/cmendezc" | ||
117 | + reponame = "automatic-extraction-growth-conditions/tree/master/extraction-geo/download/srr_htregulondb" | ||
118 | + repo_url = '/'.join([repognrl,reponame]) | ||
119 | + | ||
120 | + # Input files ======================================================================================== | ||
121 | + min_score = int(options.min_score) | ||
122 | + min_probs = float(options.min_probs) | ||
123 | + npl_ifile = os.path.join(options.input_path, options.npl_fname) | ||
124 | + mco_ifile = os.path.join(options.input_path, options.onto_fname) | ||
125 | + mco_syn_ifile = os.path.join(options.input_path, options.syn_fname) | ||
126 | + | ||
127 | + # Output files ======================================================================================= | ||
128 | + | ||
129 | + #Save by mapping stratergy | ||
130 | + raw_ofname = "_".join(["raw", options.out_fname]) | ||
131 | + rawmap_ofile = os.path.join(options.output_path, raw_ofname) | ||
132 | + str_ofname = "_".join(["sim", options.out_fname]) | ||
133 | + strmap_ofile = os.path.join(options.output_path, str_ofname) | ||
134 | + | ||
135 | + #Saving map und unmap | ||
136 | + full_map_ofile = os.path.join(options.output_path, "full_map_"+options.out_fname) | ||
137 | + full_unmap_ofile = os.path.join(options.output_path, "full_unmap_"+options.out_fname) | ||
138 | + | ||
139 | + #Save JSONs | ||
140 | + json_ofile = os.path.join(options.output_path, options.out_fname) | ||
141 | + json_ofile_map = json_ofile.replace(".tsv", "_map.json") | ||
142 | + json_ofile_unmap= json_ofile.replace(".tsv", "_unmap.json") | ||
143 | + json_ofile_full = json_ofile.replace(".tsv", "_full.json") | ||
144 | + | ||
145 | + # Load input data ==================================================================================== | ||
146 | + | ||
147 | + #Columns for the NPL-CRF extraction | ||
148 | + exp_cols = {"SRR", "GSE", "GSM", "GPL", "PMID", "FULL_TEXT", "BANGLINE", "TERM_NAME", "TERM_TYPE", "PROB"} | ||
149 | + | ||
150 | + #Load CRF-annotation | ||
151 | + npl_full = read_table(npl_ifile, sep = "\t") | ||
152 | + | ||
153 | + #Check input | ||
154 | + obs_cols = set(npl_full.columns) | ||
155 | + if exp_cols.intersection(obs_cols) != exp_cols: | ||
156 | + ocol = ", ".join(list(exp_cols)) | ||
157 | + sys.exit(ocol + " expected columns for iAnnotatedFile" ) | ||
158 | + | ||
159 | + #Filter Input by probs | ||
160 | + npl_df = npl_full[npl_full.PROB >= min_probs] | ||
161 | + npl_df = npl_df.drop_duplicates(keep="first") | ||
162 | + npl_df = npl_df.dropna() | ||
163 | + | ||
164 | + #Cleaning input | ||
165 | + npl_df['TERM_TYPE'] = [mapping_fun.transterm_npl2mco(term) for term in npl_df.TERM_TYPE] | ||
166 | + #filter non-mco terms types | ||
167 | + npl_df = npl_df[npl_df.TERM_TYPE != "exTag Type"] | ||
168 | + | ||
169 | + #add repofile_ source. access to stored files at gitLab | ||
170 | + source_access = ['/'.join([repo_url,gse,gse+'.soft.gz']) for gse in npl_df['GSE']] | ||
171 | + npl_df['REPO_FILE'] = source_access | ||
172 | + | ||
173 | + ##remove additional spaces | ||
174 | + npl_df['TERM_NAME'] = [txt.strip() for txt in npl_df['TERM_NAME']] | ||
175 | + npl_df['PMID'] = [pmid.replace("PMID_", "") for pmid in npl_df['PMID']] | ||
176 | + | ||
177 | + #Columns for MCO | ||
178 | + exp_cols = {"TERM_ID", "TERM_NAME"} | ||
179 | + | ||
180 | + #Load MCO term names | ||
181 | + mco_df_full = read_table(mco_ifile, sep = "\t") | ||
182 | + | ||
183 | + #Check input MCO | ||
184 | + obs_cols = set(mco_df_full.columns) | ||
185 | + if exp_cols.intersection(obs_cols) != exp_cols: | ||
186 | + sys.exit("\"TERM_ID\" and \"TERM_NAME\" expected columns for iOntoFile" ) | ||
187 | + | ||
188 | + #Clean MCO input | ||
189 | + mco_df = mco_df_full[["TERM_ID","TERM_NAME"]] | ||
190 | + mco_df = mco_df.drop_duplicates(keep="first") | ||
191 | + mco_df = mco_df.dropna() | ||
192 | + | ||
193 | + #Load MCO links | ||
194 | + if options.links_fname is not None: | ||
195 | + print("\nLoad types...") | ||
196 | + mcolink_ifile = os.path.join(options.input_path, options.links_fname) | ||
197 | + exp_cols = {"TERM_ID", "TERM_TYPE"} | ||
198 | + mco_links_full = read_table(mcolink_ifile, sep = "\t") | ||
199 | + | ||
200 | + obs_cols = set(mco_links_full.columns) | ||
201 | + | ||
202 | + if exp_cols.intersection(obs_cols) != exp_cols: | ||
203 | + sys.exit("at least \"TERM_ID\" and \"TERM_TYPE\" expected columns for iLinksFile" ) | ||
204 | + | ||
205 | + mco_links = mco_links_full[["TERM_ID", "TERM_TYPE"]] | ||
206 | + mco_links = mco_links.drop_duplicates(keep="first") | ||
207 | + mco_links = mco_links.dropna() | ||
208 | + else: | ||
209 | + mco_links = None | ||
210 | + | ||
211 | + #Load MCO terms synonyms | ||
212 | + mco_json = open(mco_syn_ifile ) | ||
213 | + | ||
214 | + #format json from mco to dataframe | ||
215 | + data = json.load(mco_json) | ||
216 | + mco_syn = format_fun.json2DataFrame(data) | ||
217 | + | ||
218 | + | ||
219 | + print('\n\n-------------------------------- INPUTS --------------------------------\n') | ||
220 | + | ||
221 | + print("\nnpl tagged file\n") | ||
222 | + print(npl_df.head(3)) | ||
223 | + print("\nontology entities\n") | ||
224 | + print(mco_df.head(3)) | ||
225 | + if options.links_fname is not None: | ||
226 | + print("\nlinks and id for the ontology (MCO-type-links)\n") | ||
227 | + print(mco_links.head(3)) | ||
228 | + print("\nadditional ontology of synonyms (MCO-syn-json)\n") | ||
229 | + print(mco_syn.head(3)) | ||
230 | + | ||
231 | + | ||
232 | + print('\n\n-------------------------------- RESULTS --------------------------------\n') | ||
233 | + | ||
234 | + #################### mappping to MCO exact string #################### | ||
235 | + #npl_df = npl_df.drop_duplicates("TERM_NAME", keep="first") | ||
236 | + #npl_df = npl_df.head(10) | ||
237 | + | ||
238 | + print("\nTracking exact terms to MCO...") | ||
239 | + print(f"\nMapping {len(npl_df.index)} terms to MCO based on exact strings...") | ||
240 | + | ||
241 | + #Exact mapping to MCO | ||
242 | + raw_matches = mapping_fun.raw_map_mco( | ||
243 | + npl_df = npl_df, | ||
244 | + mco_df = mco_df, | ||
245 | + mco_links = mco_links, | ||
246 | + unmap = True) | ||
247 | + | ||
248 | + #save file name source of the raw mapping | ||
249 | + raw_matches["SOURCE"] = mco_ifile | ||
250 | + #additional column to merge | ||
251 | + raw_matches["ENTITY_NAME"] = "" | ||
252 | + | ||
253 | + #################### mappping to MCO.syn exact string #################### | ||
254 | + | ||
255 | + #define unmapped | ||
256 | + raw_mco_unmap = raw_matches[raw_matches.isna().TERM_ID] | ||
257 | + #input for te second step | ||
258 | + raw_mco_unmap = raw_mco_unmap[list(npl_df.columns)] | ||
259 | + | ||
260 | + print(f"\nMapping {len(raw_mco_unmap.index)} terms to MCO - synonyms based on exact strings...\n") | ||
261 | + | ||
262 | + #exact mapping to synonims | ||
263 | + raw_matches_syn = mapping_fun.raw_map_mco( | ||
264 | + npl_df = raw_mco_unmap, | ||
265 | + mco_df = mco_syn, | ||
266 | + unmap = True) | ||
267 | + | ||
268 | + #additional column to merge | ||
269 | + raw_matches_syn["SOURCE"] = mco_syn_ifile | ||
270 | + #raw_matches_syn["TERM_TYPE"] = "" | ||
271 | + | ||
272 | + #################### save mapped terms based on exact strings #################### | ||
273 | + | ||
274 | + #all mapped | ||
275 | + raw_map_odf = concat( | ||
276 | + [raw_matches, raw_matches_syn], | ||
277 | + sort=True).dropna() | ||
278 | + | ||
279 | + #print(raw_map_odf.head(3)) | ||
280 | + print(f"Total of terms mapped by exact strings: {len(raw_map_odf.index)}") | ||
281 | + print("Saving filtered terms from raw mapping...\n\n") | ||
282 | + | ||
283 | + raw_map_odf.to_csv( | ||
284 | + rawmap_ofile, | ||
285 | + sep = "\t", | ||
286 | + header =True, | ||
287 | + index=False) | ||
288 | + | ||
289 | + #################### unmmaped raw terms #################### | ||
290 | + raw_mco_syn_unmap = raw_matches_syn[raw_matches_syn.isna().TERM_ID] | ||
291 | + raw_mco_syn_unmap = raw_mco_syn_unmap[list(npl_df.columns)] | ||
292 | + | ||
293 | + print(f"{len(raw_mco_syn_unmap.index)} unmapped terms based on exact strings") | ||
294 | + print("Dropping duplicated unmapped term names...") | ||
295 | + raw_mco_syn_unmap = raw_mco_syn_unmap.drop_duplicates("TERM_NAME") | ||
296 | + | ||
297 | + print(f"{len(raw_mco_syn_unmap.index)} unmapped unique terms based on exact strings") | ||
298 | + | ||
299 | + #################### string similarity mapping #################### | ||
300 | + ###Matching unmaped terms by string similarity | ||
301 | + print("\ncompute string similarty...") | ||
302 | + | ||
303 | + print(f"\nMapping to MCO {len(raw_mco_syn_unmap.index)} terms based on string similarity...") | ||
304 | + | ||
305 | + str_matches = mapping_fun.str_match_map_mco( | ||
306 | + raw_mco_syn_unmap, mco_df, | ||
307 | + mco_links = mco_links, | ||
308 | + min_match=0, | ||
309 | + npl_merges=False) | ||
310 | + | ||
311 | + str_matches_odf = str_matches[str_matches.SET >= min_score] | ||
312 | + str_matches_odf["SOURCE"] = mco_ifile | ||
313 | + | ||
314 | + #################### unmmaped sim terms (MCO) #################### | ||
315 | + str_mco_unmap = str_matches[str_matches.SET < min_score] | ||
316 | + #str_mco_unmap = str_mco_unmap[list(npl_df.columns)] | ||
317 | + str_mco_unmap = str_mco_unmap.drop_duplicates("TERM_NAME") | ||
318 | + | ||
319 | + print(f"\nMapping to MCO - synonyms {len(str_mco_unmap.index)} terms based on string siilarity..\n") | ||
320 | + str_matches_syn = mapping_fun.str_match_map_mco( | ||
321 | + str_mco_unmap, mco_syn, | ||
322 | + min_match=min_score, | ||
323 | + npl_merges=False) | ||
324 | + | ||
325 | + str_matches_syn_odf = str_matches_syn[str_matches_syn.SET >= min_score] | ||
326 | + str_matches_syn_odf["SOURCE"] = mco_syn_ifile | ||
327 | + | ||
328 | + #################### save str-sim map terms #################### | ||
329 | + all_str_matches_odf = concat( | ||
330 | + [str_matches_odf, str_matches_syn_odf], | ||
331 | + sort = True).dropna() | ||
332 | + | ||
333 | + print(f"Unique terms mapped by string similarity: {len(all_str_matches_odf.index)}") | ||
334 | + | ||
335 | + all_str_matches_npl_odf = merge( | ||
336 | + npl_df, all_str_matches_odf, | ||
337 | + on = ["TERM_NAME"], | ||
338 | + how="inner") | ||
339 | + | ||
340 | + print(f"Total of terms mapped by string similarity: {len(all_str_matches_npl_odf.index)}") | ||
341 | + print("Saving filtered terms from str mapping...\n\n") | ||
342 | + | ||
343 | + all_str_matches_npl_odf.to_csv( | ||
344 | + strmap_ofile, | ||
345 | + sep = "\t", | ||
346 | + header =True, | ||
347 | + index=False) | ||
348 | + | ||
349 | + #################### save all map terms #################### | ||
350 | + raw_map_odf["CASE_MATCH"] = "MCO" | ||
351 | + raw_map_odf["SET"] = 100 | ||
352 | + raw_map_odf["SORT"] = 100 | ||
353 | + | ||
354 | + full_map = concat( | ||
355 | + [all_str_matches_npl_odf, raw_map_odf], | ||
356 | + sort = True) | ||
357 | + full_map["MAP"]=True | ||
358 | + | ||
359 | + full_map.to_csv(full_map_ofile, | ||
360 | + sep = "\t", | ||
361 | + header =True, | ||
362 | + index=False) | ||
363 | + | ||
364 | + print("--------------------END----------------------") | ||
365 | + print(f"Total of terms mapped: {len(full_map.index)}\n") | ||
366 | + | ||
367 | + ###################### Merge all unmapped ###################### | ||
368 | + full_unmap = merge(npl_df, full_map[["TERM_NAME", "TERM_ID"]], on = ["TERM_NAME"], how='left') | ||
369 | + full_unmap = full_unmap[full_unmap.isna().TERM_ID] | ||
370 | + #print(full_unmap.head(3)) | ||
371 | + | ||
372 | + print(f"Total of terms unmapped: {len(full_unmap.index)}") | ||
373 | + | ||
374 | + full_unmap["SOURCE"] = "" | ||
375 | + full_unmap["CASE_MATCH"] = "" | ||
376 | + full_unmap["SET"] = 0 | ||
377 | + full_unmap["SORT"] = 0 | ||
378 | + full_unmap["MAP"]=False | ||
379 | + | ||
380 | + full_unmap.to_csv( | ||
381 | + full_unmap_ofile, | ||
382 | + sep = "\t", | ||
383 | + header =True, | ||
384 | + index=False) | ||
385 | + | ||
386 | + #################### Formatting json #################### | ||
387 | + | ||
388 | + format_fun.to_json( | ||
389 | + df = full_map, | ||
390 | + source_info = "GEO", | ||
391 | + evidence_source = "NPL-CRF", | ||
392 | + ofname = json_ofile_map | ||
393 | + ) | ||
394 | + | ||
395 | + | ||
396 | + format_fun.to_json( | ||
397 | + df = full_unmap, | ||
398 | + source_info = "GEO", | ||
399 | + evidence_source = "NPL-CRF", | ||
400 | + ofname = json_ofile_unmap | ||
401 | + ) | ||
402 | + | ||
403 | + #Merge output all | ||
404 | + full_merge = concat([full_map, full_unmap], sort=True) | ||
405 | + format_fun.to_json( | ||
406 | + df = full_merge, | ||
407 | + source_info = "GEO", | ||
408 | + evidence_source = "NPL-CRF", | ||
409 | + ofname = json_ofile_full | ||
410 | + ) | ||
411 | + | ||
... | \ No newline at end of file | ... | \ No newline at end of file |
mapping_MCO/bin/mapping2MCO_v6.py
0 → 100755
1 | +# -*- coding: utf-8 -*- | ||
2 | +""" | ||
3 | +#Setup | ||
4 | +""" | ||
5 | + | ||
6 | +#################### Setup #################### | ||
7 | +from optparse import OptionParser | ||
8 | +import os | ||
9 | +from numpy.core.fromnumeric import sort | ||
10 | +from pandas import read_csv, DataFrame, merge, concat, read_table | ||
11 | +from numpy import exp, nan | ||
12 | +import seaborn as sns | ||
13 | +from numpy import mean | ||
14 | + | ||
15 | +import matplotlib.pyplot as plt | ||
16 | +import matplotlib | ||
17 | +matplotlib.style.use('ggplot') | ||
18 | +# %matplotlib inline | ||
19 | + | ||
20 | +from collections import Counter, defaultdict | ||
21 | +import json | ||
22 | + | ||
23 | +from fuzzywuzzy import fuzz | ||
24 | +from fuzzywuzzy import process | ||
25 | + | ||
26 | +#import format_fun | ||
27 | +import format_fun_v6 as format_fun | ||
28 | +import mapping_fun | ||
29 | +import sys | ||
30 | + | ||
31 | +""" | ||
32 | +# input parameters | ||
33 | +--inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ | ||
34 | +--iAnnotatedFile srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv | ||
35 | +--iOntoFile gc_ontology_terms_v2.txt | ||
36 | +--iLinksFile gc_ontology_terms_link_v2.txt | ||
37 | +--iSynFile mco_terms_v0.2.json | ||
38 | +--outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ | ||
39 | +--outputFile all_srr_IV_mapped.tsv | ||
40 | +--minPerMatch 90 | ||
41 | + | ||
42 | + | ||
43 | +#Example | ||
44 | +# nohup python3 /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/mapping2MCO_v5.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv --iOntoFile gc_ontology_terms_v2.txt --iSynFile mco_terms_v0.2.json --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v2/ --outputFile srr_htregulondb_v2.tsv --minPerMatch 80 --minCRFProbs 0.9 > /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v2/srr_htregulondb_mapping_report_v2.out & | ||
45 | +""" | ||
46 | +#################### Defining parameters #################### | ||
if __name__ == "__main__":
    # ==================================================================
    # Map NPL/CRF-extracted growth-condition terms to the MCO ontology.
    #
    # Pipeline:
    #   1) exact-string match against MCO term names,
    #   2) exact-string match against MCO synonyms (JSON),
    #   3) fuzzy string-similarity match against MCO,
    #   4) fuzzy string-similarity match against MCO synonyms.
    # Mapped and unmapped terms are saved as TSV and JSON outputs.
    # ==================================================================

    # ---------------- Command-line interface ----------------
    parser = OptionParser()
    parser.add_option(
        "--inputPath",
        dest="input_path",
        help="Path of npl tagged file (crf output)",
        metavar="PATH")
    parser.add_option(
        "--iAnnotatedFile",
        dest="npl_fname",
        help="Input file of npl tagged file (crf output)",
        metavar="FILE",
        default="")
    parser.add_option(
        "--iOntoFile",
        dest="onto_fname",
        help="Input file with the ontology entities",
        metavar="FILE",
        default="")
    parser.add_option(
        "--iLinksFile",
        dest="links_fname",
        help="Input file with links and id for the ontology",
        metavar="FILE",
        default=None)
    parser.add_option(
        "--iSynFile",
        dest="syn_fname",
        help="Input file for the additional ontology of synonyms",
        metavar="FILE",
        default=None)
    parser.add_option(
        "--outputPath",
        dest="output_path",
        help="Output path to place output files",
        metavar="PATH")
    parser.add_option(
        "--outputFile",
        dest="out_fname",
        help="Output file name for the mapping process",
        metavar="FILE",
        default="")
    parser.add_option(
        "--minPerMatch",
        dest="min_score",
        help="Minimal string matching percentage")
    parser.add_option(
        "--minCRFProbs",
        dest="min_probs",
        help="Minimal crf probabilities")

    (options, args) = parser.parse_args()
    if len(args) > 0:
        # BUG FIX: the old message ("Any parameter given.") was garbled and
        # the sys.exit(1) that followed was unreachable -- parser.error()
        # already prints usage and exits with status 2.
        parser.error("Unexpected positional arguments: " + " ".join(args))

    # ---------------- Display parameters ----------------
    print('\n\n-------------------------------- PARAMETERS --------------------------------\n')
    print("--inputPath Path of npl tagged file: " + str(options.input_path))
    print("--iAnnotatedFile Input file of npl tagged file: " + str(options.npl_fname))
    print("--iOntoFile Input file with the ontology entities (MCO-terms): " + str(options.onto_fname))
    print("--iLinksFile Input file with links and id for the ontology (MCO-type-links): " + str(options.links_fname))
    print("--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): " + str(options.syn_fname))
    print("--outputPath Output path to place output files: " + str(options.output_path))
    print("--outputFile Output of the mapping process: " + str(options.out_fname))
    print("--minPerMatch Minimal string matching percentage: " + str(options.min_score))
    print("--minCRFProbs Minimal crf probabilities allowed: " + str(options.min_probs))

    print("\n\n")
    # Base URL of the GitLab repository where the downloaded GEO .soft
    # files are stored; used to build a per-GSE provenance link below.
    repognrl = "http://pakal.ccg.unam.mx/cmendezc"
    reponame = "automatic-extraction-growth-conditions/tree/master/extraction-geo/download/srr_htregulondb"
    repo_url = '/'.join([repognrl, reponame])

    # Input files ========================================================================================
    min_score = int(options.min_score)    # fuzzy-match acceptance threshold (0-100)
    min_probs = float(options.min_probs)  # CRF probability filter threshold
    npl_ifile = os.path.join(options.input_path, options.npl_fname)
    mco_ifile = os.path.join(options.input_path, options.onto_fname)
    mco_syn_ifile = os.path.join(options.input_path, options.syn_fname)

    # Output files =======================================================================================

    # Per-strategy outputs (exact vs. similarity mapping).
    raw_ofname = "_".join(["raw", options.out_fname])
    rawmap_ofile = os.path.join(options.output_path, raw_ofname)
    str_ofname = "_".join(["sim", options.out_fname])
    strmap_ofile = os.path.join(options.output_path, str_ofname)

    # Combined mapped / unmapped outputs.
    full_map_ofile = os.path.join(options.output_path, "full_map_" + options.out_fname)
    full_unmap_ofile = os.path.join(options.output_path, "full_unmap_" + options.out_fname)

    # JSON outputs derived from the TSV output name.
    json_ofile = os.path.join(options.output_path, options.out_fname)
    json_ofile_map = json_ofile.replace(".tsv", "_map.json")
    json_ofile_unmap = json_ofile.replace(".tsv", "_unmap.json")
    json_ofile_full = json_ofile.replace(".tsv", "_full.json")

    # Load input data ====================================================================================

    # Columns required in the NPL-CRF extraction file.
    exp_cols = {"SRR", "GSE", "GSM", "GPL", "PMID", "FULL_TEXT", "BANGLINE", "TERM_NAME", "TERM_TYPE", "PROB"}

    # Load CRF annotation.
    npl_full = read_table(npl_ifile, sep="\t")

    # Validate the expected columns are present.
    obs_cols = set(npl_full.columns)
    if exp_cols.intersection(obs_cols) != exp_cols:
        ocol = ", ".join(list(exp_cols))
        sys.exit(ocol + " expected columns for iAnnotatedFile")

    # Keep only confident predictions, then deduplicate and drop NAs.
    npl_df = npl_full[npl_full.PROB >= min_probs]
    npl_df = npl_df.drop(columns=["PROB"])
    npl_df = npl_df.drop_duplicates(keep="first")
    npl_df = npl_df.dropna()

    # Translate NPL term types to MCO vocabulary and drop the ones
    # that have no MCO counterpart.
    npl_df['TERM_TYPE'] = [mapping_fun.transterm_npl2mco(term) for term in npl_df.TERM_TYPE]
    npl_df = npl_df[npl_df.TERM_TYPE != "exTag Type"]

    # Provenance: link each record back to the stored GEO soft file in GitLab.
    source_access = ['/'.join([repo_url, gse, gse + '.soft.gz']) for gse in npl_df['GSE']]
    npl_df['REPO_FILE'] = source_access

    # Normalize term names and PMIDs (strip spaces / "PMID_" prefix).
    npl_df['TERM_NAME'] = [txt.strip() for txt in npl_df['TERM_NAME']]
    npl_df['PMID'] = [pmid.replace("PMID_", "") for pmid in npl_df['PMID']]

    # Columns required in the MCO ontology file.
    exp_cols = {"TERM_ID", "TERM_NAME"}

    # Load MCO term names.
    mco_df_full = read_table(mco_ifile, sep="\t")

    # Validate MCO input.
    obs_cols = set(mco_df_full.columns)
    if exp_cols.intersection(obs_cols) != exp_cols:
        sys.exit("\"TERM_ID\" and \"TERM_NAME\" expected columns for iOntoFile")

    # Clean MCO input.
    mco_df = mco_df_full[["TERM_ID", "TERM_NAME"]]
    mco_df = mco_df.drop_duplicates(keep="first")
    mco_df = mco_df.dropna()

    # Load MCO term-type links (optional input).
    if options.links_fname is not None:
        print("\nLoad types...")
        mcolink_ifile = os.path.join(options.input_path, options.links_fname)
        exp_cols = {"TERM_ID", "TERM_TYPE"}
        mco_links_full = read_table(mcolink_ifile, sep="\t")

        obs_cols = set(mco_links_full.columns)

        if exp_cols.intersection(obs_cols) != exp_cols:
            sys.exit("at least \"TERM_ID\" and \"TERM_TYPE\" expected columns for iLinksFile")

        mco_links = mco_links_full[["TERM_ID", "TERM_TYPE"]]
        mco_links = mco_links.drop_duplicates(keep="first")
        mco_links = mco_links.dropna()
    else:
        mco_links = None

    # Load MCO term synonyms (JSON) and reshape into a DataFrame.
    # BUG FIX: use a context manager so the file handle is closed.
    with open(mco_syn_ifile) as mco_json:
        data = json.load(mco_json)
    mco_syn = format_fun.json2DataFrame(data)

    print('\n\n-------------------------------- INPUTS --------------------------------\n')

    print("\nnpl tagged file\n")
    print(npl_df.head(3))
    print("\nontology entities\n")
    print(mco_df.head(3))
    if options.links_fname is not None:
        print("\nlinks and id for the ontology (MCO-type-links)\n")
        print(mco_links.head(3))
    print("\nadditional ontology of synonyms (MCO-syn-json)\n")
    print(mco_syn.head(3))

    print('\n\n-------------------------------- RESULTS --------------------------------\n')

    # #################### mapping to MCO: exact string ####################

    print("\nTracking exact terms to MCO...")
    print(f"\nMapping {len(npl_df.index)} terms to MCO based on exact strings...")

    # Exact mapping to MCO term names (unmap=True keeps unmatched rows
    # with NA TERM_ID so they can be re-tried in later stages).
    raw_matches = mapping_fun.raw_map_mco(
        npl_df=npl_df,
        mco_df=mco_df,
        mco_links=mco_links,
        unmap=True)

    # Record provenance of the exact mapping and add a merge column.
    raw_matches["SOURCE"] = mco_ifile
    raw_matches["ENTITY_NAME"] = ""

    # #################### mapping to MCO synonyms: exact string ####################

    # Rows that failed the first exact pass feed the synonym pass.
    raw_mco_unmap = raw_matches[raw_matches.isna().TERM_ID]
    raw_mco_unmap = raw_mco_unmap[list(npl_df.columns)]

    print(f"\nMapping {len(raw_mco_unmap.index)} terms to MCO - synonyms based on exact strings...\n")

    # Exact mapping against the synonym table.
    raw_matches_syn = mapping_fun.raw_map_mco(
        npl_df=raw_mco_unmap,
        mco_df=mco_syn,
        unmap=True)

    raw_matches_syn["SOURCE"] = mco_syn_ifile

    # #################### save exact-string matches ####################

    # All exact matches (dropna removes the still-unmapped rows).
    raw_map_odf = concat(
        [raw_matches, raw_matches_syn],
        sort=True).dropna()

    print(f"Total of terms mapped by exact strings: {len(raw_map_odf.index)}")
    print("Saving filtered terms from raw mapping...\n\n")

    raw_map_odf.to_csv(
        rawmap_ofile,
        sep="\t",
        header=True,
        index=False)

    # #################### terms unmapped by exact strings ####################
    raw_mco_syn_unmap = raw_matches_syn[raw_matches_syn.isna().TERM_ID]
    raw_mco_syn_unmap = raw_mco_syn_unmap[list(npl_df.columns)]

    print(f"{len(raw_mco_syn_unmap.index)} unmapped terms based on exact strings")
    print("Dropping duplicated unmapped term names...")
    # Similarity matching works per unique term name; duplicates are
    # re-attached later via the merge on TERM_NAME.
    raw_mco_syn_unmap = raw_mco_syn_unmap.drop_duplicates("TERM_NAME")

    print(f"{len(raw_mco_syn_unmap.index)} unmapped unique terms based on exact strings")

    # #################### string-similarity mapping ####################
    print("\ncompute string similarty...")

    print(f"\nMapping to MCO {len(raw_mco_syn_unmap.index)} terms based on string similarity...")

    # min_match=0 so we keep every candidate score; the threshold is
    # applied below to split mapped vs. unmapped.
    str_matches = mapping_fun.str_match_map_mco(
        raw_mco_syn_unmap, mco_df,
        mco_links=mco_links,
        min_match=0,
        npl_merges=False)

    # BUG FIX: .copy() avoids mutating a view of str_matches
    # (pandas SettingWithCopyWarning / silently lost assignment).
    str_matches_odf = str_matches[str_matches.SET >= min_score].copy()
    str_matches_odf["SOURCE"] = mco_ifile

    # #################### terms unmapped by similarity (MCO) ####################
    str_mco_unmap = str_matches[str_matches.SET < min_score]
    str_mco_unmap = str_mco_unmap.drop_duplicates("TERM_NAME")

    print(f"\nMapping to MCO - synonyms {len(str_mco_unmap.index)} terms based on string siilarity..\n")
    str_matches_syn = mapping_fun.str_match_map_mco(
        str_mco_unmap, mco_syn,
        min_match=min_score,
        npl_merges=False)

    # BUG FIX: .copy() for the same view-mutation reason as above.
    str_matches_syn_odf = str_matches_syn[str_matches_syn.SET >= min_score].copy()
    str_matches_syn_odf["SOURCE"] = mco_syn_ifile

    # #################### save similarity matches ####################
    all_str_matches_odf = concat(
        [str_matches_odf, str_matches_syn_odf],
        sort=True).dropna()

    print(f"Unique terms mapped by string similarity: {len(all_str_matches_odf.index)}")

    # Re-attach the full NPL records (similarity ran on unique names only).
    all_str_matches_npl_odf = merge(
        npl_df, all_str_matches_odf,
        on=["TERM_NAME"],
        how="inner")

    print(f"Total of terms mapped by string similarity: {len(all_str_matches_npl_odf.index)}")
    print("Saving filtered terms from str mapping...\n\n")

    all_str_matches_npl_odf.to_csv(
        strmap_ofile,
        sep="\t",
        header=True,
        index=False)

    # #################### save all mapped terms ####################
    # Exact matches get perfect similarity scores by definition.
    raw_map_odf["CASE_MATCH"] = "MCO"
    raw_map_odf["SET"] = 100
    raw_map_odf["SORT"] = 100

    full_map = concat(
        [all_str_matches_npl_odf, raw_map_odf],
        sort=True)
    full_map["MAP"] = True

    full_map.to_csv(full_map_ofile,
                    sep="\t",
                    header=True,
                    index=False)

    print("--------------------END----------------------")
    print(f"Total of terms mapped: {len(full_map.index)}\n")

    # ###################### Merge all unmapped ######################
    # Left-join against the mapped names; rows with NA TERM_ID were
    # never mapped by any strategy.
    full_unmap = merge(npl_df, full_map[["TERM_NAME", "TERM_ID"]], on=["TERM_NAME"], how='left')
    # BUG FIX: .copy() before the column assignments below.
    full_unmap = full_unmap[full_unmap.isna().TERM_ID].copy()

    print(f"Total of terms unmapped: {len(full_unmap.index)}")

    full_unmap["SOURCE"] = ""
    full_unmap["CASE_MATCH"] = ""
    full_unmap["SET"] = 0
    full_unmap["SORT"] = 0
    full_unmap["MAP"] = False

    full_unmap.to_csv(
        full_unmap_ofile,
        sep="\t",
        header=True,
        index=False)

    # #################### Formatting JSON ####################

    format_fun.to_json(
        df=full_map,
        source_info="GEO",
        evidence_source="NPL-CRF",
        ofname=json_ofile_map
    )

    format_fun.to_json(
        df=full_unmap,
        source_info="GEO",
        evidence_source="NPL-CRF",
        ofname=json_ofile_unmap
    )

    # Combined output: mapped + unmapped in a single JSON.
    full_merge = concat([full_map, full_unmap], sort=True)
    format_fun.to_json(
        df=full_merge,
        source_info="GEO",
        evidence_source="NPL-CRF",
        ofname=json_ofile_full
    )
from pandas import read_csv, merge

# Filter the CRF extraction output down to the (TERM_TYPE, TERM_NAME)
# pairs that were manually validated by a curator, and save the result
# with the original column layout.

crf_output_file = "/home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/srr_htregulondb/srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv"
annot_file = "/home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/correct_gc_terms_07_rev_Victor.csv"
filter_ofile = "/home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/srr_htregulondb_correct_gc_terms_07_rev_Victor.tsv"

# Column layout of the CRF output file; also used to order the output.
crf_columns = ["SRR", "GSE", "GSM", "GPL", "PMID", "GSM_NAME",
               "GSE_NAME", "GPL_NAME", "BANGLINE",
               "SOURCE_TEXT_CTRL", "FULL_TEXT", "TERM_NAME",
               "TERM_TYPE", "PROB"]

# Curated terms: two columns, no header; strip stray whitespace so the
# join below matches the CRF side.
annot = read_csv(annot_file, names=["TERM_TYPE", "TERM_NAME"])
annot["TERM_NAME"] = [term.strip() for term in annot["TERM_NAME"]]

# CRF predictions (tab-separated, no header).
crf_output = read_csv(crf_output_file, names=crf_columns, sep="\t")
crf_output["TERM_NAME"] = [term.strip() for term in crf_output["TERM_NAME"]]

# Inner join keeps only curated (type, name) pairs, then restore the
# original CRF column order before writing.
filtered = merge(annot, crf_output, on=["TERM_TYPE", "TERM_NAME"])
filtered = filtered.reindex(columns=crf_columns)

filtered.to_csv(filter_ofile, sep="\t", index=False, header=True)
This diff could not be displayed because it is too large.
1 | +GC_type,GC_term | ||
2 | +Agit,250 rpm | ||
3 | +Air,Aerobic | ||
4 | +Air,Aerobic and anaerobic | ||
5 | +Air,aerobically | ||
6 | +Air,anaerobic | ||
7 | +Gtype,{ delta } baeR | ||
8 | +Gtype,{ delta } cpxR | ||
9 | +Gtype,{ delta } cspABCEG | ||
10 | +Gtype,{ delta } cspABEG | ||
11 | +Gtype,{ delta } cspBG | ||
12 | +Gtype,{ delta } hns | ||
13 | +Gtype,{ delta } kdpE | ||
14 | +Gtype,{ delta } nusG | ||
15 | +Gtype,{ delta } perC : : kanR | ||
16 | +Gtype,{ delta } phoB | ||
17 | +Gtype,{ delta } rho | ||
18 | +Gtype,{ delta } rnr | ||
19 | +Gtype,{ delta } zraR | ||
20 | +Gtype,: φO104 | ||
21 | +Gtype,DH5α ( pAR060302 ) | ||
22 | +Gtype,E.coli K12 BW25113 | ||
23 | +Gtype,K12 MG1655 | ||
24 | +Gtype,K12 MG1655 deltaprfC | ||
25 | +Gtype,K12 MG1655 prfB-Bstrain allele | ||
26 | +Gtype,K12 MG1655 prfB-Bstrain allele deltaprfC | ||
27 | +Gtype,K12 MG1657 | ||
28 | +Gtype,K12 MG1667 | ||
29 | +Gtype,K12 MG1668 | ||
30 | +Gtype,K12 MG1672 | ||
31 | +Gtype,K12 MG1673 | ||
32 | +Gtype,K12 MG1674 | ||
33 | +Gtype,K12 W3110 | ||
34 | +Gtype,MC4100 ∆ tig : : kan pTig-TEV-Avi | ||
35 | +Gtype,O157 : H7 NCTC 12900 | ||
36 | +Gtype,PNPase mutant | ||
37 | +Gtype,Pck over-expressed | ||
38 | +Gtype,RNase II mutant | ||
39 | +Gtype,RNase R mutant | ||
40 | +Gtype,W3110 6xHis-rpoD | ||
41 | +Gtype,W3110 6xHis-rpoD greA : : tet greB : : amp | ||
42 | +Gtype,"W3110 rpoC-6xHis : : kan greA : : tet , greB : : amp" | ||
43 | +Gtype,WT | ||
44 | +Gtype,WT WT | ||
45 | +Gtype,Wild type | ||
46 | +Gtype,Wild-type | ||
47 | +Gtype,Wildtype | ||
48 | +Gtype,cra KO ; BW25113 Dcra | ||
49 | +Gtype,cya mutant background | ||
50 | +Gtype,delta Crp | ||
51 | +Gtype,delta _ cra | ||
52 | +Gtype,delta-gadE | ||
53 | +Gtype,delta-gadW | ||
54 | +Gtype,delta-gadX | ||
55 | +Gtype,delta-oxyR | ||
56 | +Gtype,delta-soxR | ||
57 | +Gtype,delta-soxS | ||
58 | +Gtype,fepA knockout | ||
59 | +Gtype,fis mutant background | ||
60 | +Gtype,lacA knockout | ||
61 | +Gtype,lack Fis protein | ||
62 | +Gtype,lack H-NS protein | ||
63 | +Gtype,naive ( wild type ) | ||
64 | +Gtype,ompR deletion mutant | ||
65 | +Gtype,phageO104 in the wrbA gene | ||
66 | +Gtype,phagePA8 in the argW gene | ||
67 | +Gtype,rng mutant | ||
68 | +Gtype,sdhC knockout | ||
69 | +Gtype,sigma70 WT | ||
70 | +Gtype,wild type | ||
71 | +Gtype,wild type ; MG1655 | ||
72 | +Gtype,wild-type | ||
73 | +Gtype,wildtype | ||
74 | +Gtype,wt | ||
75 | +Gtype,yafC deletion | ||
76 | +Gtype,ybaO deletion | ||
77 | +Gtype,ybaQ deletion | ||
78 | +Gtype,ybiH deletion | ||
79 | +Gtype,ydcI deletion | ||
80 | +Gtype,yddM deletion | ||
81 | +Gtype,yeiE deletion | ||
82 | +Gtype,yheO deletion | ||
83 | +Gtype,yiaJ deletion | ||
84 | +Gtype,yieP deletion | ||
85 | +Gtype,Δcra | ||
86 | +Gtype,Δfur | ||
87 | +Gtype,ΔgadE | ||
88 | +Gtype,ΔgadW | ||
89 | +Gtype,ΔgadX | ||
90 | +Gtype,ΔoxyR | ||
91 | +Gtype,ΔsoxR | ||
92 | +Gtype,ΔsoxS | ||
93 | +Gtype,∆ cspABCEG | ||
94 | +Gtype,∆ cspABEG | ||
95 | +Gtype,∆ cspBG | ||
96 | +Gtype,∆ hfq : : cat ) | ||
97 | +Gtype,∆ rnr | ||
98 | +Med,Bertani ( LB ) medium | ||
99 | +Med,Davis Minimal medium | ||
100 | +Med,LB | ||
101 | +Med,LB media | ||
102 | +Med,LB medium | ||
103 | +Med,"LB medium ," | ||
104 | +Med,M9 + 4 g/L glc ( glucose minimal media ) | ||
105 | +Med,M9 minimal media | ||
106 | +Med,M9 minimal medium | ||
107 | +Med,MOPS complete-glucose liquid media | ||
108 | +Med,MOPS glucose minimal medium | ||
109 | +Med,MOPS medium | ||
110 | +Med,Neidhardt MOPS Minimal Medium ( NM3 ) | ||
111 | +Med,SB medium | ||
112 | +Med,SILAC | ||
113 | +Med,W2 minimal media | ||
114 | +Med,fresh DM500 | ||
115 | +Med,fully supplemented MOPS glucose media | ||
116 | +Med,glucose-M9 minimal media | ||
117 | +Med,glucose-limited minimal medium | ||
118 | +Med,in fresh LB medium | ||
119 | +Med,minimal medium | ||
120 | +OD,O.D. 600nm 0.5 | ||
121 | +OD,OD600 = 0.3 | ||
122 | +OD,OD600 of about 0.8 | ||
123 | +Phase,IspG1 strain | ||
124 | +Phase,exponential | ||
125 | +Phase,log phase | ||
126 | +Phase,log phase sample | ||
127 | +Phase,mid-log phase | ||
128 | +Phase,stationary | ||
129 | +Phase,stationary phase | ||
130 | +Supp,0.1 mM KCl | ||
131 | +Supp,0.2 % arabinose | ||
132 | +Supp,0.2 % glucose | ||
133 | +Supp,0.2 % glutamine | ||
134 | +Supp,0.2 mM of DPD | ||
135 | +Supp,0.3 % glucose | ||
136 | +Supp,0.3 M of NaCl | ||
137 | +Supp,0.4 % glucose | ||
138 | +Supp,0.5 % glucose | ||
139 | +Supp,100 μM IPTG | ||
140 | +Supp,1mM IPTG | ||
141 | +Supp,2 mM Hydrogen peroxide | ||
142 | +Supp,22 mM glucose | ||
143 | +Supp,250 uM of paraquat | ||
144 | +Supp,2g/L glucose | ||
145 | +Supp,2g/L glucose and 1 mM cytidine | ||
146 | +Supp,4g/L glucose | ||
147 | +Supp,50 µM NiCl2 | ||
148 | +Supp,70 µM IPTG | ||
149 | +Supp,DPD | ||
150 | +Supp,Fe | ||
151 | +Supp,IPTG | ||
152 | +Supp,IPTG was | ||
153 | +Supp,L-trp | ||
154 | +Supp,Xgal and IPTG | ||
155 | +Supp,acetate | ||
156 | +Supp,ade | ||
157 | +Supp,arabinose | ||
158 | +Supp,fructose | ||
159 | +Supp,glucose | ||
160 | +Supp,glutamine | ||
161 | +Supp,induced 50 µM IPTG | ||
162 | +Supp,mM IPTG | ||
163 | +Supp,mM IPTG + 50μg/ml Amp | ||
164 | +Supp,rhamnose | ||
165 | +Supp,rifampicin | ||
166 | +Supp,rifampicin and | ||
167 | +Supp,rifampicin time point | ||
168 | +Supp,rifampicin time point 0 | ||
169 | +Supp,rifampicin time point 4 | ||
170 | +Supp,rifampicin time point 6 | ||
171 | +Supp,rifampicin time point 8 | ||
172 | +Temp,10 °C | ||
173 | +Temp,30 °C | ||
174 | +Temp,37 °C | ||
175 | +Temp,37 ℃ | ||
176 | +Temp,42 °C | ||
177 | +pH,pH 5.5 | ||
178 | +pH,pH5 .5 |
mapping_MCO/input/format_zika_v3.py
0 → 100755
1 | +# -*- coding: utf-8 -*- | ||
2 | +""" | ||
3 | +#Setup | ||
4 | +""" | ||
5 | + | ||
6 | +#################### Setup #################### | ||
7 | +from collections import defaultdict | ||
8 | +from optparse import OptionParser | ||
9 | +import os | ||
10 | +from numpy.core.fromnumeric import sort | ||
11 | +from pandas import read_csv, DataFrame, merge, concat, read_table | ||
12 | +from numpy import exp, nan | ||
13 | +import seaborn as sns | ||
14 | +from numpy import mean | ||
15 | + | ||
16 | +import matplotlib.pyplot as plt | ||
17 | +import matplotlib | ||
18 | +matplotlib.style.use('ggplot') | ||
19 | +# %matplotlib inline | ||
20 | + | ||
21 | +from collections import Counter | ||
22 | +import json | ||
23 | + | ||
24 | +from fuzzywuzzy import fuzz | ||
25 | +from fuzzywuzzy import process | ||
26 | + | ||
27 | +import format_fun | ||
28 | +import mapping_fun | ||
29 | +import sys | ||
30 | + | ||
31 | +""" | ||
32 | +# input parameters | ||
33 | +--inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ | ||
34 | +--iAnnotatedFile srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv | ||
35 | +--iOntoFile gc_ontology_terms_v2.txt | ||
36 | +--iLinksFile gc_ontology_terms_link_v2.txt | ||
37 | +--iSynFile mco_terms_v0.2.json | ||
38 | +--outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ | ||
39 | +--outputFile all_srr_IV_mapped.tsv | ||
40 | +--minPerMatch 90 | ||
41 | + | ||
42 | + | ||
43 | +#Example | ||
44 | +# nohup python3 mapping2MCO_v3.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv --iOntoFile gc_ontology_terms_v2.txt --iSynFile mco_terms_v0.2.json --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ --outputFile srr_htregulondb_mapped.tsv --minPerMatch 80 --minCRFProbs 0.9 > ../reports/srr_htregulondb_mapping_report.out & | ||
45 | +""" | ||
#################### Defining parameters ####################
if __name__ == "__main__":
    # Command-line interface. Every input/output location arrives through a
    # named option; no positional arguments are accepted.
    parser = OptionParser()
    parser.add_option(
        "--inputPath",
        dest="input_path",
        help="Path of npl tagged file (crf output)",
        metavar="PATH")
    parser.add_option(
        "--iAnnotatedFile",
        dest="npl_fname",
        help="Input file of npl tagged file (crf output)",
        metavar="FILE",
        default="")
    parser.add_option(
        "--iOntoFile",
        dest="onto_fname",
        help="Input file with the ontology entities",
        metavar="FILE",
        default="")
    parser.add_option(
        "--iLinksFile",
        dest="links_fname",
        help="Input file with links and id for the ontology",
        metavar="FILE",
        default=None)
    parser.add_option(
        "--iSynFile",
        dest="syn_fname",
        help="Input file for the additional ontology of synonyms",
        metavar="FILE",
        default=None)
    parser.add_option(
        "--outputPath",
        dest="output_path",
        help="Output path to place output files",
        metavar="PATH")
    parser.add_option(
        "--outputFile",
        dest="out_fname",
        help="Output file name for the mapping process",
        metavar="FILE",
        default="")
    parser.add_option(
        "--minPerMatch",
        dest="min_score",
        help="Minimal string matching percentage")
    parser.add_option(
        "--minCRFProbs",
        dest="min_probs",
        help="Minimal crf probabilities")

    (options, args) = parser.parse_args()
    if len(args) > 0:
        # BUG FIX: the original message read "Any parameter given." and was
        # followed by an unreachable sys.exit(1) -- OptionParser.error()
        # already prints the message and exits with status 2.
        parser.error("No positional arguments expected, only options.")

    #################### DISP PARAMETERS ####################
    # Echo every option so batch/nohup runs leave a self-describing log.
    print('\n\n-------------------------------- PARAMETERS --------------------------------\n')
    print("--inputPath Path of npl tagged file: " + str(options.input_path))
    print("--iAnnotatedFile Input file of npl tagged file: " + str(options.npl_fname))
    print("--iOntoFile Input file with the ontology entities (MCO-terms): " + str(options.onto_fname))
    print("--iLinksFile Input file with links and id for the ontology (MCO-type-links): " + str(options.links_fname))
    print("--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): " + str(options.syn_fname))
    print("--outputPath Output path to place output files: " + str(options.output_path))
    print("--outputFile Output of the mapping process: " + str(options.out_fname))
    print("--minPerMatch Minimal string matching percentage: " + str(options.min_score))
    print("--minCRFProbs Minimal crf probabilities allowed: " + str(options.min_probs))

    print("\n\n")
    # Public repository location used to build REPO_FILE-style links.
    repognrl = "http://pakal.ccg.unam.mx/cmendezc"
    reponame = "automatic-extraction-growth-conditions/tree/master/extraction-geo/download/srr_htregulondb"
    repo_url = '/'.join([repognrl, reponame])

    # Input files and filter thresholds.
    min_score = int(options.min_score)    # minimal fuzzy-match percentage
    min_probs = float(options.min_probs)  # minimal CRF tagging probability
    npl_ifile = os.path.join(options.input_path, options.npl_fname)
    mco_ifile = os.path.join(options.input_path, options.onto_fname)
    mco_syn_ifile = os.path.join(options.input_path, options.syn_fname)

    # Output files: raw (exact-string) and sim (similarity) mapping tables,
    # plus full mapped/unmapped tables and JSON exports.
    raw_ofname = "_".join(["raw", options.out_fname])
    rawmap_ofile = os.path.join(options.output_path, raw_ofname)
    str_ofname = "_".join(["sim", options.out_fname])
    strmap_ofile = os.path.join(options.output_path, str_ofname)

    full_ofile = os.path.join(options.output_path, "full_" + options.out_fname)
    full_unmap_ofile = os.path.join(options.output_path, "full_unmap_" + options.out_fname)

    json_ofile = os.path.join(options.output_path, options.out_fname)
    json_ofile_map = json_ofile.replace(".tsv", "_map.json")
    json_ofile_unmap = json_ofile.replace(".tsv", "_unmap.json")

    #################### Load input data ####################
    # Load CRF-annotation (one row per tagged growth-condition term) and
    # fail fast if any expected column is absent.
    exp_cols = {"SRR", "GSE", "GSM", "GPL", "PMID", "FULL_TEXT", "BANGLINE", "TERM_NAME", "TERM_TYPE", "PROB"}
    npl_full = read_table(npl_ifile, sep="\t")

    obs_cols = set(npl_full.columns)

    if exp_cols.intersection(obs_cols) != exp_cols:
        ocol = ", ".join(list(exp_cols))
        sys.exit(ocol + " expected columns for iAnnotatedFile")

    # Load MCO term names (TERM_ID -> TERM_NAME), deduplicated and NA-free.
    exp_cols = {"TERM_ID", "TERM_NAME"}
    mco_df_full = read_table(mco_ifile, sep="\t")
    obs_cols = set(mco_df_full.columns)

    if exp_cols.intersection(obs_cols) != exp_cols:
        sys.exit("\"TERM_ID\" and \"TERM_NAME\" expected columns for iOntoFile")

    mco_df = mco_df_full[["TERM_ID", "TERM_NAME"]]
    mco_df = mco_df.drop_duplicates(keep="first")
    mco_df = mco_df.dropna()

    # Load the optional MCO term-type links (TERM_ID -> TERM_TYPE).
    if options.links_fname is not None:
        print("\nLoad types...")
        mcolink_ifile = os.path.join(options.input_path, options.links_fname)
        exp_cols = {"TERM_ID", "TERM_TYPE"}
        mco_links_full = read_table(mcolink_ifile, sep="\t")

        obs_cols = set(mco_links_full.columns)

        if exp_cols.intersection(obs_cols) != exp_cols:
            sys.exit("at least \"TERM_ID\" and \"TERM_TYPE\" expected columns for iLinksFile")

        mco_links = mco_links_full[["TERM_ID", "TERM_TYPE"]]
        mco_links = mco_links.drop_duplicates(keep="first")
        mco_links = mco_links.dropna()
    else:
        mco_links = None

    # Load MCO term synonyms: reshape the MCO JSON into a DataFrame.
    mco_json = open(mco_syn_ifile)
    data = json.load(mco_json)
    mco_syn = format_fun.json2DataFrame(data)

    # Group one JSON record per tagged term, keyed by its SRR run id.
    df_json = defaultdict(list)

    # NOTE(review): `full_unmap` is referenced below but never built in this
    # script -- the mapping steps that should produce it (cf. the
    # mapping2MCO_v* pipeline) appear to be missing. TODO: restore those
    # steps or load the table from disk before this loop can run.
    for idx, row in full_unmap.iterrows():
        # BUG FIX: the original line ended with a stray "), output)" that
        # made the whole file a SyntaxError.
        record = format_fun.created_record(row)
        df_json[row.SRR].append(record)

    # BUG FIX: `json_ofile_list` and `json_ofile_df_list` were used without
    # ever being defined (NameError). Derive them from the output file name,
    # following the same pattern as json_ofile_map / json_ofile_unmap.
    json_ofile_list = json_ofile.replace(".tsv", "_list.json")
    json_ofile_df_list = json_ofile.replace(".tsv", "_df_list.json")

    # Dump the whole SRR->records dictionary as a single JSON document.
    with open(json_ofile_list, "w") as output:
        json.dump(format_fun.created_record(df_json), output)

    # Append one JSON document per SRR (original used "a" append mode).
    with open(json_ofile_df_list, "a") as output:
        for srr, records in df_json.items():
            json.dump(format_fun.created_record(records), output)
... | \ No newline at end of file | ... | \ No newline at end of file |
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
File moved
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
1 | +/usr/local/lib/python3.6/dist-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning | ||
2 | + warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning') | ||
3 | +/home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/mapping2MCO_v5.py:312: SettingWithCopyWarning: | ||
4 | +A value is trying to be set on a copy of a slice from a DataFrame. | ||
5 | +Try using .loc[row_indexer,col_indexer] = value instead | ||
6 | + | ||
7 | +See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy | ||
8 | + str_matches_odf["SOURCE"] = mco_ifile | ||
9 | + | ||
10 | + | ||
11 | +-------------------------------- PARAMETERS -------------------------------- | ||
12 | + | ||
13 | +--inputPath Path of npl tagged file: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ | ||
14 | +--iAnnotatedFile Input file of npl tagged file: srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv | ||
15 | +--iOntoFile Input file with the ontology entities (MCO-terms): gc_ontology_terms_v2.txt | ||
16 | +--iLinksFile Input file with links and id for the ontology (MCO-type-links): None | ||
17 | +--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): mco_terms_v0.2.json | ||
18 | +--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v2/ | ||
19 | +--outputFile Output of the mapping process: srr_htregulondb.tsv | ||
20 | +--minPerMatch Minimal string matching percentage: 80 | ||
21 | +--minCRFProbs Minimal crf probabilities allowed: 0.9 | ||
22 | + | ||
23 | + | ||
24 | + | ||
25 | + | ||
26 | + | ||
27 | +-------------------------------- INPUTS -------------------------------- | ||
28 | + | ||
29 | + | ||
30 | +npl tagged file | ||
31 | + | ||
32 | + SRR ... REPO_FILE | ||
33 | +0 SRR5742248 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex... | ||
34 | +5 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex... | ||
35 | +7 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex... | ||
36 | + | ||
37 | +[3 rows x 15 columns] | ||
38 | + | ||
39 | +ontology entities | ||
40 | + | ||
41 | + TERM_ID TERM_NAME | ||
42 | +0 MCO000000014 generically dependent continuant | ||
43 | +1 MCO000000015 radiation | ||
44 | +2 MCO000000016 electromagnetic radiation | ||
45 | + | ||
46 | +additional ontology of synonyms (MCO-syn-json) | ||
47 | + | ||
48 | + ENTITY_NAME TERM_ID TERM_NAME | ||
49 | +MCO000000019 continuant MCO000000019 | ||
50 | +MCO000002475 culture medium MCO000002475 | ||
51 | +MCO000002467_0 Organism MCO000002467 biologicentity | ||
52 | + | ||
53 | + | ||
54 | +-------------------------------- RESULTS -------------------------------- | ||
55 | + | ||
56 | + | ||
57 | +Tracking exact terms to MCO... | ||
58 | + | ||
59 | +Mapping 4099 terms to MCO based on exact strings... | ||
60 | + | ||
61 | +Mapping 3770 terms to MCO - synonyms based on exact strings... | ||
62 | + | ||
63 | +Total of terms mapped by exact strings: 387 | ||
64 | +Saving filtered terms from raw mapping... | ||
65 | + | ||
66 | + | ||
67 | +3712 unmapped terms based on exact strings | ||
68 | +Dropping duplicated unmapped term names... | ||
69 | +206 unmapped unique terms based on exact strings | ||
70 | + | ||
71 | +compute string similarty... | ||
72 | + | ||
73 | +Mapping to MCO 206 terms based on string similarity... | ||
74 | + | ||
75 | +Mapping to MCO - synonyms 152 terms based on string siilarity.. | ||
76 | + | ||
77 | +Unique terms mapped by string similarity: 73 | ||
78 | +Total of terms mapped by string similarity: 1992 | ||
79 | +Saving filtered terms from str mapping... | ||
80 | + | ||
81 | + | ||
82 | +--------------------END---------------------- | ||
83 | +Total of terms mapped: 2379 | ||
84 | + | ||
85 | +Total of terms unmapped: 1720 |
1 | +/usr/local/lib/python3.6/dist-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning | ||
2 | + warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning') | ||
3 | +/home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/mapping2MCO_v5.py:312: SettingWithCopyWarning: | ||
4 | +A value is trying to be set on a copy of a slice from a DataFrame. | ||
5 | +Try using .loc[row_indexer,col_indexer] = value instead | ||
6 | + | ||
7 | +See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy | ||
8 | + str_matches_odf["SOURCE"] = mco_ifile | ||
9 | + | ||
10 | + | ||
11 | +-------------------------------- PARAMETERS -------------------------------- | ||
12 | + | ||
13 | +--inputPath Path of npl tagged file: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ | ||
14 | +--iAnnotatedFile Input file of npl tagged file: srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv | ||
15 | +--iOntoFile Input file with the ontology entities (MCO-terms): gc_ontology_terms_v2.txt | ||
16 | +--iLinksFile Input file with links and id for the ontology (MCO-type-links): None | ||
17 | +--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): mco_terms_v0.2.json | ||
18 | +--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v2/ | ||
19 | +--outputFile Output of the mapping process: srr_htregulondb.tsv | ||
20 | +--minPerMatch Minimal string matching percentage: 80 | ||
21 | +--minCRFProbs Minimal crf probabilities allowed: 0.9 | ||
22 | + | ||
23 | + | ||
24 | + | ||
25 | + | ||
26 | + | ||
27 | +-------------------------------- INPUTS -------------------------------- | ||
28 | + | ||
29 | + | ||
30 | +npl tagged file | ||
31 | + | ||
32 | + SRR ... REPO_FILE | ||
33 | +0 SRR5742248 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex... | ||
34 | +5 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex... | ||
35 | +7 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex... | ||
36 | + | ||
37 | +[3 rows x 15 columns] | ||
38 | + | ||
39 | +ontology entities | ||
40 | + | ||
41 | + TERM_ID TERM_NAME | ||
42 | +0 MCO000000014 generically dependent continuant | ||
43 | +1 MCO000000015 radiation | ||
44 | +2 MCO000000016 electromagnetic radiation | ||
45 | + | ||
46 | +additional ontology of synonyms (MCO-syn-json) | ||
47 | + | ||
48 | + ENTITY_NAME TERM_ID TERM_NAME | ||
49 | +MCO000000019 continuant MCO000000019 | ||
50 | +MCO000002475 culture medium MCO000002475 | ||
51 | +MCO000002467_0 Organism MCO000002467 biologicentity | ||
52 | + | ||
53 | + | ||
54 | +-------------------------------- RESULTS -------------------------------- | ||
55 | + | ||
56 | + | ||
57 | +Tracking exact terms to MCO... | ||
58 | + | ||
59 | +Mapping 4099 terms to MCO based on exact strings... | ||
60 | + | ||
61 | +Mapping 3770 terms to MCO - synonyms based on exact strings... | ||
62 | + | ||
63 | +Total of terms mapped by exact strings: 387 | ||
64 | +Saving filtered terms from raw mapping... | ||
65 | + | ||
66 | + | ||
67 | +3712 unmapped terms based on exact strings | ||
68 | +Dropping duplicated unmapped term names... | ||
69 | +206 unmapped unique terms based on exact strings | ||
70 | + | ||
71 | +compute string similarty... | ||
72 | + | ||
73 | +Mapping to MCO 206 terms based on string similarity... | ||
74 | + | ||
75 | +Mapping to MCO - synonyms 152 terms based on string siilarity.. | ||
76 | + | ||
77 | +Unique terms mapped by string similarity: 73 | ||
78 | +Total of terms mapped by string similarity: 1992 | ||
79 | +Saving filtered terms from str mapping... | ||
80 | + | ||
81 | + | ||
82 | +--------------------END---------------------- | ||
83 | +Total of terms mapped: 2379 | ||
84 | + | ||
85 | +Total of terms unmapped: 1720 |
This diff could not be displayed because it is too large.
mapping_MCO/output/v3.1/zika.json
0 → 100644
This diff could not be displayed because it is too large.
mapping_MCO/output/v3.1/zika_v3.json
0 → 100644
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
File moved
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
mapping_MCO/output/v4/curated/full_map_srr_htregulondb_correct_gc_terms_07_rev_Victor_mapped.tsv
0 → 100644
This diff could not be displayed because it is too large.
mapping_MCO/output/v4/curated/full_unmap_srr_htregulondb_correct_gc_terms_07_rev_Victor_mapped.tsv
0 → 100644
This diff could not be displayed because it is too large.
mapping_MCO/output/v4/curated/raw_srr_htregulondb_correct_gc_terms_07_rev_Victor_mapped.tsv
0 → 100644
This diff could not be displayed because it is too large.
mapping_MCO/output/v4/curated/sim_srr_htregulondb_correct_gc_terms_07_rev_Victor_mapped.tsv
0 → 100644
This diff could not be displayed because it is too large.
mapping_MCO/output/v4/curated/srr_htregulondb_correct_gc_terms_07_rev_Victor_mapped_full.json
0 → 100644
This diff could not be displayed because it is too large.
mapping_MCO/output/v4/curated/srr_htregulondb_correct_gc_terms_07_rev_Victor_mapped_map.json
0 → 100644
This diff could not be displayed because it is too large.
mapping_MCO/output/v4/curated/srr_htregulondb_correct_gc_terms_07_rev_Victor_mapped_unmap.json
0 → 100644
This diff could not be displayed because it is too large.
1 | +/usr/local/lib/python3.6/dist-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning | ||
2 | + warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning') | ||
3 | +/home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/mapping2MCO_v6.py:313: SettingWithCopyWarning: | ||
4 | +A value is trying to be set on a copy of a slice from a DataFrame. | ||
5 | +Try using .loc[row_indexer,col_indexer] = value instead | ||
6 | + | ||
7 | +See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy | ||
8 | + str_matches_odf["SOURCE"] = mco_ifile | ||
9 | + | ||
10 | + | ||
11 | +-------------------------------- PARAMETERS -------------------------------- | ||
12 | + | ||
13 | +--inputPath Path of npl tagged file: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ | ||
14 | +--iAnnotatedFile Input file of npl tagged file: srr_htregulondb_correct_gc_terms_07_rev_Victor.tsv | ||
15 | +--iOntoFile Input file with the ontology entities (MCO-terms): gc_ontology_terms_v2.txt | ||
16 | +--iLinksFile Input file with links and id for the ontology (MCO-type-links): None | ||
17 | +--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): mco_terms_v0.2.json | ||
18 | +--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v4/curated/ | ||
19 | +--outputFile Output of the mapping process: srr_htregulondb_correct_gc_terms_07_rev_Victor_mapped.tsv | ||
20 | +--minPerMatch Minimal string matching percentage: 80 | ||
21 | +--minCRFProbs Minimal crf probabilities allowed: 0.9 | ||
22 | + | ||
23 | + | ||
24 | + | ||
25 | + | ||
26 | + | ||
27 | +-------------------------------- INPUTS -------------------------------- | ||
28 | + | ||
29 | + | ||
30 | +npl tagged file | ||
31 | + | ||
32 | + SRR ... REPO_FILE | ||
33 | +0 SRR771533 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex... | ||
34 | +2 SRR771534 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex... | ||
35 | +24 SRR3194453 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex... | ||
36 | + | ||
37 | +[3 rows x 14 columns] | ||
38 | + | ||
39 | +ontology entities | ||
40 | + | ||
41 | + TERM_ID TERM_NAME | ||
42 | +0 MCO000000014 generically dependent continuant | ||
43 | +1 MCO000000015 radiation | ||
44 | +2 MCO000000016 electromagnetic radiation | ||
45 | + | ||
46 | +additional ontology of synonyms (MCO-syn-json) | ||
47 | + | ||
48 | + ENTITY_NAME TERM_ID TERM_NAME | ||
49 | +MCO000000019 continuant MCO000000019 | ||
50 | +MCO000002475 culture medium MCO000002475 | ||
51 | +MCO000002467_0 Organism MCO000002467 biologicentity | ||
52 | + | ||
53 | + | ||
54 | +-------------------------------- RESULTS -------------------------------- | ||
55 | + | ||
56 | + | ||
57 | +Tracking exact terms to MCO... | ||
58 | + | ||
59 | +Mapping 2149 terms to MCO based on exact strings... | ||
60 | + | ||
61 | +Mapping 1820 terms to MCO - synonyms based on exact strings... | ||
62 | + | ||
63 | +Total of terms mapped by exact strings: 387 | ||
64 | +Saving filtered terms from raw mapping... | ||
65 | + | ||
66 | + | ||
67 | +1762 unmapped terms based on exact strings | ||
68 | +Dropping duplicated unmapped term names... | ||
69 | +104 unmapped unique terms based on exact strings | ||
70 | + | ||
71 | +compute string similarty... | ||
72 | + | ||
73 | +Mapping to MCO 104 terms based on string similarity... | ||
74 | + | ||
75 | +Mapping to MCO - synonyms 61 terms based on string siilarity.. | ||
76 | + | ||
77 | +Unique terms mapped by string similarity: 58 | ||
78 | +Total of terms mapped by string similarity: 1570 | ||
79 | +Saving filtered terms from str mapping... | ||
80 | + | ||
81 | + | ||
82 | +--------------------END---------------------- | ||
83 | +Total of terms mapped: 1957 | ||
84 | + | ||
85 | +Total of terms unmapped: 192 |
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
1 | +/usr/local/lib/python3.6/dist-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning | ||
2 | + warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning') | ||
3 | +/home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/mapping2MCO_v5.py:312: SettingWithCopyWarning: | ||
4 | +A value is trying to be set on a copy of a slice from a DataFrame. | ||
5 | +Try using .loc[row_indexer,col_indexer] = value instead | ||
6 | + | ||
7 | +See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy | ||
8 | + str_matches_odf["SOURCE"] = mco_ifile | ||
9 | + | ||
10 | + | ||
11 | +-------------------------------- PARAMETERS -------------------------------- | ||
12 | + | ||
13 | +--inputPath Path of npl tagged file: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ | ||
14 | +--iAnnotatedFile Input file of npl tagged file: srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv | ||
15 | +--iOntoFile Input file with the ontology entities (MCO-terms): gc_ontology_terms_v2.txt | ||
16 | +--iLinksFile Input file with links and id for the ontology (MCO-type-links): None | ||
17 | +--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): mco_terms_v0.2.json | ||
18 | +--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v3/ | ||
19 | +--outputFile Output of the mapping process: srr_htregulondb.tsv | ||
20 | +--minPerMatch Minimal string matching percentage: 80 | ||
21 | +--minCRFProbs Minimal crf probabilities allowed: 0.9 | ||
22 | + | ||
23 | + | ||
24 | + | ||
25 | + | ||
26 | + | ||
27 | +-------------------------------- INPUTS -------------------------------- | ||
28 | + | ||
29 | + | ||
30 | +npl tagged file | ||
31 | + | ||
32 | + SRR ... REPO_FILE | ||
33 | +0 SRR5742248 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex... | ||
34 | +5 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex... | ||
35 | +7 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex... | ||
36 | + | ||
37 | +[3 rows x 15 columns] | ||
38 | + | ||
39 | +ontology entities | ||
40 | + | ||
41 | + TERM_ID TERM_NAME | ||
42 | +0 MCO000000014 generically dependent continuant | ||
43 | +1 MCO000000015 radiation | ||
44 | +2 MCO000000016 electromagnetic radiation | ||
45 | + | ||
46 | +additional ontology of synonyms (MCO-syn-json) | ||
47 | + | ||
48 | + ENTITY_NAME TERM_ID TERM_NAME | ||
49 | +MCO000000019 continuant MCO000000019 | ||
50 | +MCO000002475 culture medium MCO000002475 | ||
51 | +MCO000002467_0 Organism MCO000002467 biologicentity | ||
52 | + | ||
53 | + | ||
54 | +-------------------------------- RESULTS -------------------------------- | ||
55 | + | ||
56 | + | ||
57 | +Tracking exact terms to MCO... | ||
58 | + | ||
59 | +Mapping 4099 terms to MCO based on exact strings... | ||
60 | + | ||
61 | +Mapping 3770 terms to MCO - synonyms based on exact strings... | ||
62 | + | ||
63 | +Total of terms mapped by exact strings: 387 | ||
64 | +Saving filtered terms from raw mapping... | ||
65 | + | ||
66 | + | ||
67 | +3712 unmapped terms based on exact strings | ||
68 | +Dropping duplicated unmapped term names... | ||
69 | +206 unmapped unique terms based on exact strings | ||
70 | + | ||
71 | +compute string similarty... | ||
72 | + | ||
73 | +Mapping to MCO 206 terms based on string similarity... | ||
74 | + | ||
75 | +Mapping to MCO - synonyms 152 terms based on string siilarity.. | ||
76 | + | ||
77 | +Unique terms mapped by string similarity: 73 | ||
78 | +Total of terms mapped by string similarity: 1992 | ||
79 | +Saving filtered terms from str mapping... | ||
80 | + | ||
81 | + | ||
82 | +--------------------END---------------------- | ||
83 | +Total of terms mapped: 2379 | ||
84 | + | ||
85 | +Total of terms unmapped: 1720 |
1 | +/usr/local/lib/python3.6/dist-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning | ||
2 | + warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning') | ||
3 | +/home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/mapping2MCO_v6.py:313: SettingWithCopyWarning: | ||
4 | +A value is trying to be set on a copy of a slice from a DataFrame. | ||
5 | +Try using .loc[row_indexer,col_indexer] = value instead | ||
6 | + | ||
7 | +See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy | ||
8 | + str_matches_odf["SOURCE"] = mco_ifile | ||
9 | + | ||
10 | + | ||
11 | +-------------------------------- PARAMETERS -------------------------------- | ||
12 | + | ||
13 | +--inputPath Path of npl tagged file: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ | ||
14 | +--iAnnotatedFile Input file of npl tagged file: srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv | ||
15 | +--iOntoFile Input file with the ontology entities (MCO-terms): gc_ontology_terms_v2.txt | ||
16 | +--iLinksFile Input file with links and id for the ontology (MCO-type-links): None | ||
17 | +--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): mco_terms_v0.2.json | ||
18 | +--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v4/ | ||
19 | +--outputFile Output of the mapping process: srr_htregulondb.tsv | ||
20 | +--minPerMatch Minimal string matching percentage: 80 | ||
21 | +--minCRFProbs Minimal crf probabilities allowed: 0.9 | ||
22 | + | ||
23 | + | ||
24 | + | ||
25 | + | ||
26 | + | ||
27 | +-------------------------------- INPUTS -------------------------------- | ||
28 | + | ||
29 | + | ||
30 | +npl tagged file | ||
31 | + | ||
32 | + SRR ... REPO_FILE | ||
33 | +0 SRR5742248 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex... | ||
34 | +5 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex... | ||
35 | +7 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex... | ||
36 | + | ||
37 | +[3 rows x 14 columns] | ||
38 | + | ||
39 | +ontology entities | ||
40 | + | ||
41 | + TERM_ID TERM_NAME | ||
42 | +0 MCO000000014 generically dependent continuant | ||
43 | +1 MCO000000015 radiation | ||
44 | +2 MCO000000016 electromagnetic radiation | ||
45 | + | ||
46 | +additional ontology of synonyms (MCO-syn-json) | ||
47 | + | ||
48 | + ENTITY_NAME TERM_ID TERM_NAME | ||
49 | +MCO000000019 continuant MCO000000019 | ||
50 | +MCO000002475 culture medium MCO000002475 | ||
51 | +MCO000002467_0 Organism MCO000002467 biologicentity | ||
52 | + | ||
53 | + | ||
54 | +-------------------------------- RESULTS -------------------------------- | ||
55 | + | ||
56 | + | ||
57 | +Tracking exact terms to MCO... | ||
58 | + | ||
59 | +Mapping 3769 terms to MCO based on exact strings... | ||
60 | + | ||
61 | +Mapping 3440 terms to MCO - synonyms based on exact strings... | ||
62 | + | ||
63 | +Total of terms mapped by exact strings: 387 | ||
64 | +Saving filtered terms from raw mapping... | ||
65 | + | ||
66 | + | ||
67 | +3382 unmapped terms based on exact strings | ||
68 | +Dropping duplicated unmapped term names... | ||
69 | +206 unmapped unique terms based on exact strings | ||
70 | + | ||
71 | +compute string similarty... | ||
72 | + | ||
73 | +Mapping to MCO 206 terms based on string similarity... | ||
74 | + | ||
75 | +Mapping to MCO - synonyms 152 terms based on string siilarity.. | ||
76 | + | ||
77 | +Unique terms mapped by string similarity: 73 | ||
78 | +Total of terms mapped by string similarity: 1668 | ||
79 | +Saving filtered terms from str mapping... | ||
80 | + | ||
81 | + | ||
82 | +--------------------END---------------------- | ||
83 | +Total of terms mapped: 2055 | ||
84 | + | ||
85 | +Total of terms unmapped: 1714 |
This diff could not be displayed because it is too large.
mapping_MCO/reports/zika_mapping_report.out
0 → 100644
1 | + | ||
2 | + | ||
3 | +-------------------------------- PARAMETERS -------------------------------- | ||
4 | + | ||
5 | +--inputPath Path of npl tagged file: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ | ||
6 | +--iAnnotatedFile Input file of npl tagged file: No_GSM_Metadata_Selected_v4.tsv | ||
7 | +--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v2/ | ||
8 | +--outputFile Output of the mapping process: zika.json | ||
9 | + | ||
10 | + | ||
11 | + | ||
12 | +Total zika terms: 2351 |
-
Please register or login to post a comment