Estefani Gaytan Nunez

upload

Showing 56 changed files with 1597 additions and 3 deletions
......@@ -57,6 +57,12 @@ if (!length(opt)){
## Input files and output directories
infoFile <- opt$infoFile
if (!"gse" %in% names(gseInfo)){
stop("include at least gse column")
}
if (!"gsm" %in% names(gseInfo)){
gseInfor$gsm <- "GSM"
}
## Load main variables
......@@ -89,4 +95,4 @@ for (geoid in unique(gseInfo$gse)) {
}
cat("download id: ", length(list.dirs(opt$downloadPath, recursive = FALSE)))
message("Required GSE: ", ngse_down)
\ No newline at end of file
message("Required GSE: ", ngse_down)
......
......@@ -42,6 +42,11 @@ def get_crossref_info(info_df):
- **pmid**: PubMed ID
"""
def get_cite_info(info_df):
if(info_df.CASE_MATCH == "ZIKA"):
cite_dict ={
"evidence_id":"",
"evidence_name":"ZIKA",
"pmid" : info_df.PMID}
cite_dict ={
"evidence_id": "",
"evidence_name" : "NPL-CRF", #NPL
......@@ -49,6 +54,8 @@ def get_cite_info(info_df):
}
return(cite_dict)
def get_description(info_df):
if(info_df.CASE_MATCH=="ZIKA"):
mco_mapping = {}
if info_df.CASE_MATCH == "MCO" and info_df.SET == 100 and info_df.SORT == 100:
mco_mapping = {
"type": "term present on MCO"
......
from numpy import nan
from collections import OrderedDict
from pandas import DataFrame as DF
"""
- **name**: nombre del termino registrado en la MCO
- **term_id**: identificador del termino en RegulonDB (si existe)
- **term_type**: tipo de termino, uno de los siguientes: "Organism", "Genetic background", "Medium", Medium supplement", "Aeration", "Temperature","pH", "Pressure", "Optical Density (OD)", "Growth phase", "Growth rate", "Vessel type", "Agitation speed"
- **source_data**: subdocumento que contiene informacion de GEO de donde se extrajo la informacion de la GC
- **source**: fuente de los datos [ GEO, ]
- **id**: identificador del registro de la base de datos o fuente de datos
- **field**: campo de donde se esta tomando la informacion de la GC [ metadata field]
- **associatedPhrase**: Frase de donde se tomo la informacion
"""
def get_term_info(info_df, source):
    """Build the flat term descriptor for one annotated row.

    Keys mirror the documented schema above: term identity (name/id/type),
    plus provenance (repository file, source database, GSM id, metadata
    field, and the phrase the term was extracted from).
    """
    return {
        "name": info_df.TERM_NAME,              # NPL output
        "term_id": info_df.TERM_ID,             # MCO
        "term_type": info_df.TERM_TYPE,         # NPL
        "source_data": info_df.REPO_FILE,       # NPL
        "source": source,
        "id": info_df.GSM,                      # NPL
        "field": info_df.BANGLINE,              # NPL
        "associatedPhrase": info_df.FULL_TEXT,  # NPL
    }
"""
- **objectId**: Identificador en la base de datos fuente
- **externalCrossReferences_name**: nombre de la DB [ GEO ]
"""
def get_crossref_info(info_df, source):
    """Build the external cross-reference subdocument (sample id + DB name)."""
    crossref = {"objectId": info_df.GSM}  # NPL
    crossref["externalCrossReferences_name"] = source
    return crossref
"""
- **evidence_id**: Identificador de RegulondB asociado a la evidencia
- **evidence_name**: nombre de la evidencia, si es que no cuenta con el identificador
- **pmid**: PubMed ID
"""
def get_cite_info(info_df, esource):
    """Build the citation subdocument.

    evidence_id is not known at this stage, so it is always emitted empty;
    the evidence is identified by name (esource) and the row's PMID.
    """
    return {
        "evidence_id": "",
        "evidence_name": esource,
        "pmid": info_df.PMID,
    }
def get_description(info_df, no_map=False):
    """Describe how (or whether) the row's term was mapped to the MCO.

    Returns a small dict: "not present on MCO" when no_map is set,
    "term present on MCO" for an exact match (CASE_MATCH == "MCO" with
    perfect SET/SORT fuzzy scores), otherwise a string-similarity record
    carrying the SET score.
    """
    if no_map:
        return {"type": "not present on MCO"}
    exact_match = (info_df.CASE_MATCH == "MCO"
                   and info_df.SET == 100
                   and info_df.SORT == 100)
    if exact_match:
        return {"type": "term present on MCO"}
    return {"type": "string similarity", "score": info_df.SET}
"""
#run it in the main for each field
return: type
id: string
name: string
description: string
terms: list of dict
externalCrossReferences: list of dict
citations: list of dict
"""
def created_record(term_info_df, source = "GEO", no_map = False, esource = "NPL-CRF"):
    """Assemble one output record for a single annotated row.

    NaNs are blanked first so the JSON never carries nan values. The record
    keeps a fixed key order: id, name, description, terms,
    externalCrossReferences, citations (each list-valued field holds a
    single subdocument built by the matching helper).
    """
    clean = term_info_df.replace(nan, '', regex=True)
    record = OrderedDict()
    record["id"] = clean.TERM_ID        # present only if the term was mapped
    record["name"] = clean.TERM_NAME    # column from the NPL output
    record["description"] = [get_description(clean, no_map=no_map)]
    record["terms"] = [get_term_info(clean, source)]
    record["externalCrossReferences"] = [get_crossref_info(clean, source)]
    record["citations"] = [get_cite_info(clean, esource)]
    return record
def _mco_syn_rows(entry):
    """Yield (key, row) pairs for one MCO JSON entry.

    One row per synonym (key "<id>_<k>", lowercased synonym); falls back to
    "hasRelatedSynonyms", and emits a single blank-synonym row keyed by the
    plain id when the entry has no synonym list at all.
    """
    rid = entry['regulondb_id']
    name = entry['name']
    if "synonyms" in entry:
        syns = entry['synonyms']
    elif "hasRelatedSynonyms" in entry:
        syns = entry['hasRelatedSynonyms']
    else:
        yield rid, {'ENTITY_NAME': name, 'TERM_NAME': '', 'TERM_ID': rid}
        return
    for k, syn in enumerate(syns):
        yield rid + "_" + str(k), {'ENTITY_NAME': name,
                                   'TERM_NAME': syn.lower(),
                                   'TERM_ID': rid}

def json2DataFrame(data):
    """Flatten the MCO synonym JSON into a DataFrame.

    Entries without a "regulondb_id" are skipped. Returns one row per
    synonym with columns ENTITY_NAME, TERM_NAME, TERM_ID, indexed by
    "<regulondb_id>_<k>" (or the bare id when no synonyms exist).

    Refactor: the three duplicated dict-literal branches of the original
    are collapsed into the _mco_syn_rows helper; behavior is unchanged.
    """
    mco_syn_dic = dict()
    for i in data:
        if "regulondb_id" in i.keys():
            for dict_key, row in _mco_syn_rows(i):
                mco_syn_dic[dict_key] = row
    mco_syn_df = DF.from_dict(mco_syn_dic).T
    return(mco_syn_df)
from numpy import nan
#from collections import OrderedDict
from pandas import DataFrame as DF
import json
from collections import defaultdict
import format_fun_v6 as format_fun
def to_json(df, source_info, evidence_source, ofname):
    """Serialize the annotated rows to a JSON file grouped by SRR run id.

    First pass collects every term descriptor per SRR; second pass builds
    one record per distinct SRR (first occurrence wins) carrying its full
    term list, then the whole mapping is dumped to ofname.
    """
    terms_by_srr = defaultdict(list)
    for _, annotated in df.iterrows():
        term_record = format_fun.get_term_info(annotated,
                                               source=source_info,
                                               map=annotated.MAP)
        terms_by_srr[annotated.SRR].append(term_record)
    records = {}
    unique_runs = df.drop_duplicates("SRR", keep="first")
    for _, first_row in unique_runs.iterrows():
        records[first_row.SRR] = format_fun.created_record(
            info_df=first_row,
            term_list=terms_by_srr[first_row.SRR],
            source=source_info,
            esource=evidence_source)
    with open(ofname, "w") as output:
        json.dump(records, output, separators=(',', ':'), indent=4)
def get_score(info_df):
    """Return the MCO mapping descriptor for one matched row.

    Exact match (CASE_MATCH == "MCO" with perfect SET/SORT scores) yields
    {"type": "term present on MCO"}; anything else is reported as a
    string-similarity match with its SET score.

    Bug fix: the exact-match branch used to assign an unused local
    (`subtext`) and then return the unbound name `mco_mapping`, raising
    NameError for every exact match. Both branches now build `mco_mapping`.
    """
    if info_df.CASE_MATCH == "MCO" and info_df.SET == 100 and info_df.SORT == 100:
        mco_mapping = {
            "type": "term present on MCO"
        }
    else:
        mco_mapping = {
            "type": "string similarity",
            "score": info_df.SET
        }
    return(mco_mapping)
"""
- **name**: nombre del termino registrado en la MCO
- **term_id**: identificador del termino en RegulonDB (si existe)
- **term_type**: tipo de termino, uno de los siguientes: "Organism", "Genetic background", "Medium", Medium supplement", "Aeration", "Temperature","pH", "Pressure", "Optical Density (OD)", "Growth phase", "Growth rate", "Vessel type", "Agitation speed"
- **source_data**: subdocumento que contiene informacion de GEO de donde se extrajo la informacion de la GC
- **source**: fuente de los datos [ GEO, ]
- **id**: identificador del registro de la base de datos o fuente de datos
- **field**: campo de donde se esta tomando la informacion de la GC [ metadata field]
- **associatedPhrase**: Frase de donde se tomo la informacion
"""
def get_term_info(info_df, source, map=True):
    """Build the nested term descriptor for one annotated row.

    NaNs are blanked before reading any field. The provenance (source_data)
    subdocument carries the source DB, GSM id, metadata field, extracted
    phrase, a textual mapping status from get_description, and the SET
    fuzzy-match percentage.
    """
    info_df = info_df.replace(nan, "", regex=True)
    provenance = {
        "source": source,
        "id": info_df.GSM,                       # NPL
        "field": info_df.BANGLINE,               # NPL
        "associatedPhrase": info_df.FULL_TEXT,   # NPL
        "description": get_description(info_df, map),
        "similarity_percentage": info_df.SET,
    }
    return {
        "name": info_df.TERM_NAME,    # NPL output
        "term_id": info_df.TERM_ID,   # MCO
        "term_type": info_df.TERM_TYPE,  # NPL
        "source_data": provenance,
    }
"""
- **objectId**: Identificador en la base de datos fuente
- **externalCrossReferences_name**: nombre de la DB [ GEO ]
"""
def get_crossref_info(info_df, source):
    """Build the external cross-reference subdocument (GSM id + DB name)."""
    crossref = {"objectId": info_df.GSM}  # NPL
    crossref["externalCrossReferences_name"] = source
    return crossref
"""
- **evidence_id**: Identificador de RegulondB asociado a la evidencia
- **evidence_name**: nombre de la evidencia, si es que no cuenta con el identificador
- **pmid**: PubMed ID
"""
def get_cite_info(info_df, esource):
    """Build the citation subdocument; the RegulonDB evidence_id is not
    available here, so only the evidence name and PMID are filled in."""
    return {
        "evidence_id": "",
        "evidence_name": esource,
        "pmid": info_df.PMID,
    }
def get_description(info_df, map=True):
    """Return a human-readable mapping status for the row's term.

    "absent in RegulonDB MCO" when mapping was skipped, "RegulonDB MCO
    term" for an exact match (CASE_MATCH == "MCO" with perfect SET/SORT
    scores), otherwise "Similar term in RegulonDB MCO".
    """
    if not map:
        return "absent in RegulonDB MCO"
    if info_df.CASE_MATCH == "MCO" and info_df.SET == 100 and info_df.SORT == 100:
        return "RegulonDB MCO term"
    return "Similar term in RegulonDB MCO"
"""
#run it in the main for each field
return: type
id: string
name: string
description: string
terms: list of dict
externalCrossReferences: list of dict
citations: list of dict
"""
def created_record(info_df, term_list, source = "GEO", esource = "NPL-CRF"):
    """Assemble the per-SRR output record.

    Top-level id/name/description are intentionally left empty (they are
    not known at this stage); the record carries the pre-built term list
    plus cross-reference and citation subdocuments derived from the row.
    NaNs are blanked first so no nan leaks into the JSON.
    """
    clean = info_df.replace(nan, "", regex=True)
    return {
        "id": "",
        "name": "",
        "description": "",
        "terms": term_list,
        "externalCrossReferences": [get_crossref_info(clean, source)],
        "citations": [get_cite_info(clean, esource)],
    }
def _mco_syn_rows(entry):
    """Yield (key, row) pairs for one MCO JSON entry.

    One row per synonym (key "<id>_<k>", lowercased synonym); falls back to
    "hasRelatedSynonyms", and emits a single blank-synonym row keyed by the
    plain id when the entry has no synonym list at all.
    """
    rid = entry['regulondb_id']
    name = entry['name']
    if "synonyms" in entry:
        syns = entry['synonyms']
    elif "hasRelatedSynonyms" in entry:
        syns = entry['hasRelatedSynonyms']
    else:
        yield rid, {'ENTITY_NAME': name, 'TERM_NAME': '', 'TERM_ID': rid}
        return
    for k, syn in enumerate(syns):
        yield rid + "_" + str(k), {'ENTITY_NAME': name,
                                   'TERM_NAME': syn.lower(),
                                   'TERM_ID': rid}

def json2DataFrame(data):
    """Flatten the MCO synonym JSON into a DataFrame.

    Entries without a "regulondb_id" are skipped. Returns one row per
    synonym with columns ENTITY_NAME, TERM_NAME, TERM_ID, indexed by
    "<regulondb_id>_<k>" (or the bare id when no synonyms exist).

    Refactor: the three duplicated dict-literal branches of the original
    are collapsed into the _mco_syn_rows helper; behavior is unchanged.
    """
    mco_syn_dic = dict()
    for i in data:
        if "regulondb_id" in i.keys():
            for dict_key, row in _mco_syn_rows(i):
                mco_syn_dic[dict_key] = row
    mco_syn_df = DF.from_dict(mco_syn_dic).T
    return(mco_syn_df)
# -*- coding: utf-8 -*-
"""
#Setup
"""
#################### Setup ####################
# Maps CRF-extracted growth-condition terms to MCO ontology terms and dumps
# per-SRR JSON records.  Inputs: CRF-annotated TSV, MCO term/link tables and
# a synonym JSON; outputs: raw/sim/full TSVs plus *_map/_unmap/_list JSONs.
from collections import defaultdict
from optparse import OptionParser
import os
from numpy.core.fromnumeric import sort
from pandas import read_csv, DataFrame, merge, concat, read_table
from numpy import exp, nan
import seaborn as sns
from numpy import mean
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
# %matplotlib inline
from collections import Counter
import json
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import format_fun
import mapping_fun
import sys
"""
# input parameters
--inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
--iAnnotatedFile srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv
--iOntoFile gc_ontology_terms_v2.txt
--iLinksFile gc_ontology_terms_link_v2.txt
--iSynFile mco_terms_v0.2.json
--outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/
--outputFile all_srr_IV_mapped.tsv
--minPerMatch 90
#Example
# nohup python3 mapping2MCO_v3.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv --iOntoFile gc_ontology_terms_v2.txt --iSynFile mco_terms_v0.2.json --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ --outputFile srr_htregulondb_mapped.tsv --minPerMatch 80 --minCRFProbs 0.9 > ../reports/srr_htregulondb_mapping_report.out &
"""
#################### Defining parameters ####################
if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option(
        "--inputPath",
        dest="input_path",
        help="Path of npl tagged file (crf output)",
        metavar="PATH")
    parser.add_option(
        "--iAnnotatedFile",
        dest="npl_fname",
        help="Input file of npl tagged file (crf output)",
        metavar="FILE",
        default="")
    parser.add_option(
        "--iOntoFile",
        dest="onto_fname",
        help="Input file with the ontology entities",
        metavar="FILE",
        default="")
    parser.add_option(
        "--iLinksFile",
        dest="links_fname",
        help="Input file with links and id for the ontology",
        metavar="FILE",
        default=None)
    parser.add_option(
        "--iSynFile",
        dest="syn_fname",
        help="Input file for the additional ontology of synonyms",
        metavar="FILE",
        default=None)
    parser.add_option(
        "--outputPath",
        dest="output_path",
        help="Output path to place output files",
        metavar="PATH")
    parser.add_option(
        "--outputFile",
        dest="out_fname",
        help="Output file name for the mapping process",
        metavar="FILE",
        default="")
    parser.add_option(
        "--minPerMatch",
        dest="min_score",
        help="Minimal string matching percentage")
    parser.add_option(
        "--minCRFProbs",
        dest="min_probs",
        help="Minimal crf probabilities")
    (options, args) = parser.parse_args()
    # No positional arguments are accepted; parser.error already exits, so
    # the sys.exit(1) below is unreachable (kept as-is).
    if len(args) > 0:
        parser.error("Any parameter given.")
        sys.exit(1)
    #################### DISP PARAMETERS ####################
    print('\n\n-------------------------------- PARAMETERS --------------------------------\n')
    print("--inputPath Path of npl tagged file: " + str(options.input_path))
    print("--iAnnotatedFile Input file of npl tagged file: " + str(options.npl_fname))
    print("--iOntoFile Input file with the ontology entities (MCO-terms): " + str(options.onto_fname))
    print("--iLinksFile Input file with links and id for the ontology (MCO-type-links): " + str(options.links_fname))
    print("--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): " + str(options.syn_fname))
    print("--outputPath Output path to place output files: " + str(options.output_path))
    print("--outputFile Output of the mapping process: " + str(options.out_fname))
    print("--minPerMatch Minimal string matching percentage: " + str(options.min_score))
    print("--minCRFProbs Minimal crf probabilities allowed: " + str(options.min_probs))
    print("\n\n")
    # Repository URL used as provenance for downloaded GEO files.
    repognrl = "http://pakal.ccg.unam.mx/cmendezc"
    reponame = "automatic-extraction-growth-conditions/tree/master/extraction-geo/download/srr_htregulondb"
    repo_url = '/'.join([repognrl,reponame])
    # Input files
    min_score = int(options.min_score)
    min_probs = float(options.min_probs)
    npl_ifile = os.path.join(options.input_path, options.npl_fname)
    mco_ifile = os.path.join(options.input_path, options.onto_fname)
    mco_syn_ifile = os.path.join(options.input_path, options.syn_fname)
    #Output files
    raw_ofname = "_".join(["raw", options.out_fname])
    rawmap_ofile = os.path.join(options.output_path, raw_ofname)
    str_ofname = "_".join(["sim", options.out_fname])
    strmap_ofile = os.path.join(options.output_path, str_ofname)
    full_ofile = os.path.join(options.output_path, "full_"+options.out_fname)
    full_unmap_ofile = os.path.join(options.output_path, "full_unmap_"+options.out_fname)
    # JSON outputs derive their names from the TSV output name.
    json_ofile = os.path.join(options.output_path, options.out_fname)
    json_ofile_map = json_ofile.replace(".tsv", "_map.json")
    json_ofile_unmap= json_ofile.replace(".tsv", "_unmap.json")
    json_ofile_list= json_ofile.replace(".tsv", "_list.json")
    json_ofile_df_list= json_ofile.replace(".tsv", "_df_list.json")
    #################### Load input data ####################
    # Load CRF-annotation and verify the expected column set is present.
    exp_cols = {"SRR", "GSE", "GSM", "GPL", "PMID", "FULL_TEXT", "BANGLINE", "TERM_NAME", "TERM_TYPE", "PROB"}
    npl_full = read_table(npl_ifile, sep = "\t")
    obs_cols = set(npl_full.columns)
    if exp_cols.intersection(obs_cols) != exp_cols:
        ocol = ", ".join(list(exp_cols))
        sys.exit(ocol + " expected columns for iAnnotatedFile" )
    #Load MCO term names
    exp_cols = {"TERM_ID", "TERM_NAME"}
    mco_df_full = read_table(mco_ifile, sep = "\t")
    obs_cols = set(mco_df_full.columns)
    if exp_cols.intersection(obs_cols) != exp_cols:
        sys.exit("\"TERM_ID\" and \"TERM_NAME\" expected columns for iOntoFile" )
    mco_df = mco_df_full[["TERM_ID","TERM_NAME"]]
    mco_df = mco_df.drop_duplicates(keep="first")
    mco_df = mco_df.dropna()
    #Load MCO links (optional: only when --iLinksFile was given)
    if options.links_fname is not None:
        print("\nLoad types...")
        mcolink_ifile = os.path.join(options.input_path, options.links_fname)
        exp_cols = {"TERM_ID", "TERM_TYPE"}
        mco_links_full = read_table(mcolink_ifile, sep = "\t")
        obs_cols = set(mco_links_full.columns)
        if exp_cols.intersection(obs_cols) != exp_cols:
            sys.exit("at least \"TERM_ID\" and \"TERM_TYPE\" expected columns for iLinksFile" )
        mco_links = mco_links_full[["TERM_ID", "TERM_TYPE"]]
        mco_links = mco_links.drop_duplicates(keep="first")
        mco_links = mco_links.dropna()
    else:
        mco_links = None
    #Load MCO terms synonyms
    #format json from mco to dataframe
    mco_json = open(mco_syn_ifile )
    data = json.load(mco_json)
    mco_syn = format_fun.json2DataFrame(data)
    # Group one record per annotated row under its SRR run id.
    df_json = defaultdict(list)
    for idx,row in npl_full.iterrows():
        record = format_fun.created_record(row)
        df_json[row.SRR].append(record)
    # NOTE(review): bare expression below is a no-op (notebook leftover).
    df_json
    # NOTE(review): created_record expects a single row, but here it is
    # handed the whole df_json dict / each per-SRR list — looks suspect,
    # confirm intended behavior against format_fun.created_record.
    with open(json_ofile_list, "w") as output:
        json.dump(format_fun.created_record(df_json), output)
    with open(json_ofile_df_list, "a") as output:
        for idx,row in df_json.items():
            json.dump(format_fun.created_record(row), output)
\ No newline at end of file
# -*- coding: utf-8 -*-
"""
#Setup
"""
#################### Setup ####################
# Formats ZIKA-database annotated rows (no MCO mapping) into a single JSON
# file keyed by SRR, each entry holding a "growth_conditions" record list.
from collections import defaultdict
from optparse import OptionParser
import os
from pandas import read_csv, DataFrame, merge, concat, read_table
from numpy import exp, nan
import json
import format_fun_v4 as format_fun
import sys
"""
# input parameters
--inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
--iAnnotatedFile srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv
--outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/
--outputFile all_srr_IV_mapped.tsv
#Example
# python3 /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/format_zika_v4.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile No_GSM_Metadata_Selected_v3.tsv --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ --outputFile zika_mapped.json
"""
#################### Defining parameters ####################
if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option(
        "--inputPath",
        dest="input_path",
        help="Path of npl tagged file (crf output)",
        metavar="PATH")
    parser.add_option(
        "--iAnnotatedFile",
        dest="npl_fname",
        help="Input file of npl tagged file (crf output)",
        metavar="FILE",
        default="")
    parser.add_option(
        "--outputPath",
        dest="output_path",
        help="Output path to place output files",
        metavar="PATH")
    parser.add_option(
        "--outputFile",
        dest="out_fname",
        help="Output file name for the mapping process",
        metavar="FILE",
        default="")
    (options, args) = parser.parse_args()
    # No positional args accepted; parser.error exits, sys.exit is unreachable.
    if len(args) > 0:
        parser.error("Any parameter given.")
        sys.exit(1)
    #################### DISP PARAMETERS ####################
    print('\n\n-------------------------------- PARAMETERS --------------------------------\n')
    print("--inputPath Path of npl tagged file: " + str(options.input_path))
    print("--iAnnotatedFile Input file of npl tagged file: " + str(options.npl_fname))
    print("--outputPath Output path to place output files: " + str(options.output_path))
    print("--outputFile Output of the mapping process: " + str(options.out_fname))
    print("\n\n")
    # Input files
    npl_ifile = os.path.join(options.input_path, options.npl_fname)
    #Output files
    ofname = os.path.join(options.output_path, options.out_fname)
    #################### Load input data ####################
    # Load CRF-annotation and verify the expected column set is present.
    exp_cols = {"SRR", "GSE", "GSM", "GPL", "PMID", "FULL_TEXT", "BANGLINE", "TERM_NAME", "TERM_TYPE", "PROB"}
    npl_full = read_table(npl_ifile, sep = "\t")
    obs_cols = set(npl_full.columns)
    if exp_cols.intersection(obs_cols) != exp_cols:
        ocol = ", ".join(list(exp_cols))
        sys.exit(ocol + " expected columns for iAnnotatedFile" )
    # Build one record per row, grouped by SRR run id; ZIKA rows are never
    # mapped to the MCO (no_map = True).
    df_json = defaultdict(list)
    for idx,row in npl_full.iterrows():
        record = format_fun.created_record(row, source = "ZIKAdb", no_map = True, esource = "database")
        if(idx<2): print(record)  # spot-check the first records on stdout
        #record_json = json.dumps(record)
        record_json = record
        df_json[row.SRR].append(record_json)
    """
    with open(ofname, "a") as output:
        output.write("field:[")
        sep=""
        for k,v in df_json.items():
            output.write(sep)
            json.dump(v, output)
            sep=","
        output.write("]")
    """
    # Hand-assemble the top-level JSON object so each SRR key wraps its
    # records in a "growth_conditions" list.
    # NOTE(review): file is opened in append mode — rerunning the script
    # produces invalid JSON (concatenated objects); confirm intended.
    with open(ofname, "a") as output:
        output.write("{")
        sep=""
        for k,v in df_json.items():
            output.write(sep)
            output.write("\""+k+"\"")
            output.write(":")
            record_list = {
                "growth_conditions": df_json[k]
            }
            json.dump(record_list, output)
            sep=","
        output.write("}")
    # Round-trip parse as a sanity check that the output is valid JSON.
    df=open(ofname)
    df=json.load(df)
# -*- coding: utf-8 -*-
"""
#Setup
"""
#################### Setup ####################
# Formats ZIKA-database annotated rows into JSON via format_fun_v6.to_json;
# all rows are flagged MAP = False (no MCO mapping for ZIKA data).
from optparse import OptionParser
import os
from pandas import read_csv, DataFrame, merge, concat, read_table
from numpy import mean
import format_fun_v6 as format_fun
import sys
"""
# input parameters
--inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
--iAnnotatedFile srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv
--outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/
--outputFile all_srr_IV_mapped.tsv
#Example
# python3 /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/format_zika_v5.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile No_GSM_Metadata_Selected_v4.tsv --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v2/ --outputFile zika.json > automatic-extraction-growth-conditions/mapping_MCO/reports/zika_formated_report.out
# python3 /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/format_zika_v5.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile No_GSM_Metadata_Selected_v4.tsv --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/test/ --outputFile zika_mapped.json > automatic-extraction-growth-conditions/mapping_MCO/test/zika_mapping_report.out
"""
#################### Defining parameters ####################
if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option(
        "--inputPath",
        dest="input_path",
        help="Path of npl tagged file (crf output)",
        metavar="PATH")
    parser.add_option(
        "--iAnnotatedFile",
        dest="npl_fname",
        help="Input file of npl tagged file (crf output)",
        metavar="FILE",
        default="")
    parser.add_option(
        "--outputPath",
        dest="output_path",
        help="Output path to place output files",
        metavar="PATH")
    parser.add_option(
        "--outputFile",
        dest="out_fname",
        help="Output file name for the mapping process",
        metavar="FILE",
        default="")
    (options, args) = parser.parse_args()
    # No positional args accepted; parser.error exits, sys.exit is unreachable.
    if len(args) > 0:
        parser.error("Any parameter given.")
        sys.exit(1)
    #################### DISP PARAMETERS ####################
    print('\n\n-------------------------------- PARAMETERS --------------------------------\n')
    print("--inputPath Path of npl tagged file: " + str(options.input_path))
    print("--iAnnotatedFile Input file of npl tagged file: " + str(options.npl_fname))
    print("--outputPath Output path to place output files: " + str(options.output_path))
    print("--outputFile Output of the mapping process: " + str(options.out_fname))
    print("\n\n")
    # Input files
    npl_ifile = os.path.join(options.input_path, options.npl_fname)
    #Output files
    ofname = os.path.join(options.output_path, options.out_fname)
    #################### Load input data ####################
    # Load CRF-annotation, drop exact duplicate rows, and verify columns.
    exp_cols = {"SRR", "GSE", "GSM", "GPL", "PMID", "FULL_TEXT", "BANGLINE", "TERM_NAME", "TERM_TYPE", "PROB"}
    npl_full = read_table(npl_ifile, sep = "\t")
    npl_full = npl_full.drop_duplicates()
    print(f"Total zika terms: {len(npl_full)} ")
    obs_cols = set(npl_full.columns)
    if exp_cols.intersection(obs_cols) != exp_cols:
        ocol = ", ".join(list(exp_cols))
        sys.exit(ocol + " expected columns for iAnnotatedFile" )
    """
    df_terms = defaultdict(list)
    for idx,row in npl_full.iterrows():
        term_record = format_fun.get_term_info(row, source = "ZIKAdb", map=False)
        df_terms[row.SRR].append(term_record)
    df_json = {}
    df_tmp = npl_full.drop_duplicates("SRR", keep="first")
    for idx,row in df_tmp.iterrows():
        srr_record = format_fun.created_record(
            info_df = row,
            term_list = df_terms[row.SRR],
            source = "ZIKAdb",
            esource = "database")
        df_json[row.SRR] = srr_record
    with open(ofname, "w") as output:
        json.dump(df_json, output, separators=(',', ':'), indent=4)
    df=open(ofname)
    df=json.load(df)
    print(df["ERR1399578"])
    """
    # ZIKA rows carry no MCO mapping; mark every row unmapped and delegate
    # grouping + serialization to format_fun.to_json.
    npl_full["MAP"] = False
    format_fun.to_json(
        df = npl_full,
        source_info = "ZIKAdb",
        evidence_source = "database",
        ofname = ofname)
\ No newline at end of file
......@@ -23,7 +23,8 @@ import json
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import format_fun
#import format_fun
import format_fun_v4 as format_fun
import mapping_fun
import sys
......@@ -338,5 +339,6 @@ if __name__ == "__main__":
with open(json_ofile_unmap, "a") as output:
for idx,row in full_unmap.iterrows():
json.dump(format_fun.created_record(row), output)
\ No newline at end of file
......
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
This diff is collapsed. Click to expand it.
from pandas import read_csv, merge

# Keep only CRF-predicted growth-condition rows whose (TERM_TYPE, TERM_NAME)
# pair appears in the manually-reviewed whitelist, then save them as TSV.
crf_output_file = "/home/egaytan/automatic-extraction-growth-conditions/predict-annot/output/srr_htregulondb/srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv"
annot_file = "/home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/correct_gc_terms_07_rev_Victor.csv"
filter_ofile = "/home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/srr_htregulondb_correct_gc_terms_07_rev_Victor.tsv"

# Column layout shared by the CRF output and the filtered result.
crf_columns = ["SRR","GSE","GSM","GPL","PMID","GSM_NAME",
               "GSE_NAME","GPL_NAME","BANGLINE",
               "SOURCE_TEXT_CTRL","FULL_TEXT","TERM_NAME",
               "TERM_TYPE","PROB"]

# Reviewed whitelist of accepted (type, name) pairs.
whitelist = read_csv(annot_file, names = ["TERM_TYPE", "TERM_NAME"] )
whitelist.TERM_NAME = [text.strip() for text in whitelist.TERM_NAME]

# CRF predictions; term names are stripped so the merge keys line up.
predictions = read_csv(crf_output_file, names = crf_columns, sep = "\t")
predictions.TERM_NAME = [text.strip() for text in predictions.TERM_NAME]

# Inner join keeps only whitelisted rows; reindex restores the CRF layout.
filtered = merge( whitelist, predictions, on = ["TERM_TYPE", "TERM_NAME"] )
filtered = filtered.reindex(columns = crf_columns)
filtered.to_csv(filter_ofile,sep="\t",index=False,header=True)
This diff could not be displayed because it is too large.
GC_type,GC_term
Agit,250 rpm
Air,Aerobic
Air,Aerobic and anaerobic
Air,aerobically
Air,anaerobic
Gtype,{ delta } baeR
Gtype,{ delta } cpxR
Gtype,{ delta } cspABCEG
Gtype,{ delta } cspABEG
Gtype,{ delta } cspBG
Gtype,{ delta } hns
Gtype,{ delta } kdpE
Gtype,{ delta } nusG
Gtype,{ delta } perC : : kanR
Gtype,{ delta } phoB
Gtype,{ delta } rho
Gtype,{ delta } rnr
Gtype,{ delta } zraR
Gtype,: φO104
Gtype,DH5α ( pAR060302 )
Gtype,E.coli K12 BW25113
Gtype,K12 MG1655
Gtype,K12 MG1655 deltaprfC
Gtype,K12 MG1655 prfB-Bstrain allele
Gtype,K12 MG1655 prfB-Bstrain allele deltaprfC
Gtype,K12 MG1657
Gtype,K12 MG1667
Gtype,K12 MG1668
Gtype,K12 MG1672
Gtype,K12 MG1673
Gtype,K12 MG1674
Gtype,K12 W3110
Gtype,MC4100 ∆ tig : : kan pTig-TEV-Avi
Gtype,O157 : H7 NCTC 12900
Gtype,PNPase mutant
Gtype,Pck over-expressed
Gtype,RNase II mutant
Gtype,RNase R mutant
Gtype,W3110 6xHis-rpoD
Gtype,W3110 6xHis-rpoD greA : : tet greB : : amp
Gtype,"W3110 rpoC-6xHis : : kan greA : : tet , greB : : amp"
Gtype,WT
Gtype,WT WT
Gtype,Wild type
Gtype,Wild-type
Gtype,Wildtype
Gtype,cra KO ; BW25113 Dcra
Gtype,cya mutant background
Gtype,delta Crp
Gtype,delta _ cra
Gtype,delta-gadE
Gtype,delta-gadW
Gtype,delta-gadX
Gtype,delta-oxyR
Gtype,delta-soxR
Gtype,delta-soxS
Gtype,fepA knockout
Gtype,fis mutant background
Gtype,lacA knockout
Gtype,lack Fis protein
Gtype,lack H-NS protein
Gtype,naive ( wild type )
Gtype,ompR deletion mutant
Gtype,phageO104 in the wrbA gene
Gtype,phagePA8 in the argW gene
Gtype,rng mutant
Gtype,sdhC knockout
Gtype,sigma70 WT
Gtype,wild type
Gtype,wild type ; MG1655
Gtype,wild-type
Gtype,wildtype
Gtype,wt
Gtype,yafC deletion
Gtype,ybaO deletion
Gtype,ybaQ deletion
Gtype,ybiH deletion
Gtype,ydcI deletion
Gtype,yddM deletion
Gtype,yeiE deletion
Gtype,yheO deletion
Gtype,yiaJ deletion
Gtype,yieP deletion
Gtype,Δcra
Gtype,Δfur
Gtype,ΔgadE
Gtype,ΔgadW
Gtype,ΔgadX
Gtype,ΔoxyR
Gtype,ΔsoxR
Gtype,ΔsoxS
Gtype,∆ cspABCEG
Gtype,∆ cspABEG
Gtype,∆ cspBG
Gtype,∆ hfq : : cat )
Gtype,∆ rnr
Med,Bertani ( LB ) medium
Med,Davis Minimal medium
Med,LB
Med,LB media
Med,LB medium
Med,"LB medium ,"
Med,M9 + 4 g/L glc ( glucose minimal media )
Med,M9 minimal media
Med,M9 minimal medium
Med,MOPS complete-glucose liquid media
Med,MOPS glucose minimal medium
Med,MOPS medium
Med,Neidhardt MOPS Minimal Medium ( NM3 )
Med,SB medium
Med,SILAC
Med,W2 minimal media
Med,fresh DM500
Med,fully supplemented MOPS glucose media
Med,glucose-M9 minimal media
Med,glucose-limited minimal medium
Med,in fresh LB medium
Med,minimal medium
OD,O.D. 600nm 0.5
OD,OD600 = 0.3
OD,OD600 of about 0.8
Phase,IspG1 strain
Phase,exponential
Phase,log phase
Phase,log phase sample
Phase,mid-log phase
Phase,stationary
Phase,stationary phase
Supp,0.1 mM KCl
Supp,0.2 % arabinose
Supp,0.2 % glucose
Supp,0.2 % glutamine
Supp,0.2 mM of DPD
Supp,0.3 % glucose
Supp,0.3 M of NaCl
Supp,0.4 % glucose
Supp,0.5 % glucose
Supp,100 μM IPTG
Supp,1mM IPTG
Supp,2 mM Hydrogen peroxide
Supp,22 mM glucose
Supp,250 uM of paraquat
Supp,2g/L glucose
Supp,2g/L glucose and 1 mM cytidine
Supp,4g/L glucose
Supp,50 µM NiCl2
Supp,70 µM IPTG
Supp,DPD
Supp,Fe
Supp,IPTG
Supp,IPTG was
Supp,L-trp
Supp,Xgal and IPTG
Supp,acetate
Supp,ade
Supp,arabinose
Supp,fructose
Supp,glucose
Supp,glutamine
Supp,induced 50 µM IPTG
Supp,mM IPTG
Supp,mM IPTG + 50μg/ml Amp
Supp,rhamnose
Supp,rifampicin
Supp,rifampicin and
Supp,rifampicin time point
Supp,rifampicin time point 0
Supp,rifampicin time point 4
Supp,rifampicin time point 6
Supp,rifampicin time point 8
Temp,10 °C
Temp,30 °C
Temp,37 °C
Temp,37 ℃
Temp,42 °C
pH,pH 5.5
pH,pH5 .5
# -*- coding: utf-8 -*-
"""
#Setup
"""
#################### Setup ####################
from collections import defaultdict
from optparse import OptionParser
import os
from numpy.core.fromnumeric import sort
from pandas import read_csv, DataFrame, merge, concat, read_table
from numpy import exp, nan
import seaborn as sns
from numpy import mean
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot')
# %matplotlib inline
from collections import Counter
import json
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import format_fun
import mapping_fun
import sys
"""
# input parameters
--inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
--iAnnotatedFile srr_IV_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10_named.tsv
--iOntoFile gc_ontology_terms_v2.txt
--iLinksFile gc_ontology_terms_link_v2.txt
--iSynFile mco_terms_v0.2.json
--outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/
--outputFile all_srr_IV_mapped.tsv
--minPerMatch 90
#Example
# nohup python3 mapping2MCO_v3.py --inputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/ --iAnnotatedFile srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv --iOntoFile gc_ontology_terms_v2.txt --iSynFile mco_terms_v0.2.json --outputPath /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/ --outputFile srr_htregulondb_mapped.tsv --minPerMatch 80 --minCRFProbs 0.9 > ../reports/srr_htregulondb_mapping_report.out &
"""
#################### Defining parameters ####################
if __name__ == "__main__":
parser = OptionParser()
parser.add_option(
"--inputPath",
dest="input_path",
help="Path of npl tagged file (crf output)",
metavar="PATH")
parser.add_option(
"--iAnnotatedFile",
dest="npl_fname",
help="Input file of npl tagged file (crf output)",
metavar="FILE",
default="")
parser.add_option(
"--iOntoFile",
dest="onto_fname",
help="Input file with the ontology entities",
metavar="FILE",
default="")
parser.add_option(
"--iLinksFile",
dest="links_fname",
help="Input file with links and id for the ontology",
metavar="FILE",
default=None)
parser.add_option(
"--iSynFile",
dest="syn_fname",
help="Input file for the additional ontology of synonyms",
metavar="FILE",
default=None)
parser.add_option(
"--outputPath",
dest="output_path",
help="Output path to place output files",
metavar="PATH")
parser.add_option(
"--outputFile",
dest="out_fname",
help="Output file name for the mapping process",
metavar="FILE",
default="")
parser.add_option(
"--minPerMatch",
dest="min_score",
help="Minimal string matching percentage")
parser.add_option(
"--minCRFProbs",
dest="min_probs",
help="Minimal crf probabilities")
(options, args) = parser.parse_args()
if len(args) > 0:
parser.error("Any parameter given.")
sys.exit(1)
#################### DISP PARAMETERS ####################
print('\n\n-------------------------------- PARAMETERS --------------------------------\n')
print("--inputPath Path of npl tagged file: " + str(options.input_path))
print("--iAnnotatedFile Input file of npl tagged file: " + str(options.npl_fname))
print("--iOntoFile Input file with the ontology entities (MCO-terms): " + str(options.onto_fname))
print("--iLinksFile Input file with links and id for the ontology (MCO-type-links): " + str(options.links_fname))
print("--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): " + str(options.syn_fname))
print("--outputPath Output path to place output files: " + str(options.output_path))
print("--outputFile Output of the mapping process: " + str(options.out_fname))
print("--minPerMatch Minimal string matching percentage: " + str(options.min_score))
print("--minCRFProbs Minimal crf probabilities allowed: " + str(options.min_probs))
print("\n\n")
repognrl = "http://pakal.ccg.unam.mx/cmendezc"
reponame = "automatic-extraction-growth-conditions/tree/master/extraction-geo/download/srr_htregulondb"
repo_url = '/'.join([repognrl,reponame])
# Numeric thresholds arrive as strings from optparse; convert them once.
min_score = int(options.min_score)
min_probs = float(options.min_probs)
# --- Input files -------------------------------------------------------
npl_ifile = os.path.join(options.input_path, options.npl_fname)
mco_ifile = os.path.join(options.input_path, options.onto_fname)
mco_syn_ifile = os.path.join(options.input_path, options.syn_fname)
# --- Output files ------------------------------------------------------
# Each mapping stage writes its own prefixed copy of the output name:
# raw_* (exact-string matches), sim_* (similarity matches), full_* and
# full_unmap_* (merged mapped/unmapped tables), plus JSON variants.
raw_ofname = "raw_" + options.out_fname
rawmap_ofile = os.path.join(options.output_path, raw_ofname)
str_ofname = "sim_" + options.out_fname
strmap_ofile = os.path.join(options.output_path, str_ofname)
full_ofile = os.path.join(options.output_path, "_".join(["full", options.out_fname]))
full_unmap_ofile = os.path.join(options.output_path, "_".join(["full_unmap", options.out_fname]))
json_ofile = os.path.join(options.output_path, options.out_fname)
json_ofile_map = json_ofile.replace(".tsv", "_map.json")
json_ofile_unmap = json_ofile.replace(".tsv", "_unmap.json")
#################### Load input data ####################
# Load the CRF-annotated terms; abort unless every required column is
# present in the tagged file.
exp_cols = {"SRR", "GSE", "GSM", "GPL", "PMID", "FULL_TEXT", "BANGLINE", "TERM_NAME", "TERM_TYPE", "PROB"}
npl_full = read_table(npl_ifile, sep="\t")
obs_cols = set(npl_full.columns)
if not exp_cols.issubset(obs_cols):
    ocol = ", ".join(list(exp_cols))
    sys.exit(ocol + " expected columns for iAnnotatedFile" )
# Load the MCO term catalogue and keep a deduplicated, NaN-free
# (TERM_ID, TERM_NAME) lookup table.
exp_cols = {"TERM_ID", "TERM_NAME"}
mco_df_full = read_table(mco_ifile, sep="\t")
obs_cols = set(mco_df_full.columns)
if not exp_cols.issubset(obs_cols):
    sys.exit("\"TERM_ID\" and \"TERM_NAME\" expected columns for iOntoFile" )
mco_df = (
    mco_df_full[["TERM_ID", "TERM_NAME"]]
    .drop_duplicates(keep="first")
    .dropna()
)
# Load the optional MCO type-links table (TERM_ID -> TERM_TYPE).
# When --iLinksFile is not given, mco_links stays None and downstream
# code skips type information.
if options.links_fname is None:
    mco_links = None
else:
    print("\nLoad types...")
    mcolink_ifile = os.path.join(options.input_path, options.links_fname)
    exp_cols = {"TERM_ID", "TERM_TYPE"}
    mco_links_full = read_table(mcolink_ifile, sep="\t")
    obs_cols = set(mco_links_full.columns)
    if not exp_cols.issubset(obs_cols):
        sys.exit("at least \"TERM_ID\" and \"TERM_TYPE\" expected columns for iLinksFile" )
    mco_links = (
        mco_links_full[["TERM_ID", "TERM_TYPE"]]
        .drop_duplicates(keep="first")
        .dropna()
    )
#Load MCO terms synonyms
# Parse the MCO synonyms JSON export and convert it to a DataFrame with
# the project helper. A context manager guarantees the file handle is
# closed even if json.load() raises -- the original `open()` without
# `close()` leaked the handle for the life of the process.
with open(mco_syn_ifile) as mco_json:
    data = json.load(mco_json)
mco_syn = format_fun.json2DataFrame(data)
# Group the unmapped records by SRR run accession and dump them as JSON.
# NOTE(review): the original lines were syntactically broken -- a stray
# "), output)" after created_record(row), a no-op bare `df_json`
# statement, and two undefined file names (json_ofile_list /
# json_ofile_df_list). Since this loop iterates full_unmap, the
# unmapped-terms JSON path defined above (json_ofile_unmap) is
# presumably the intended target -- TODO confirm with the author.
df_json = defaultdict(list)
for _, row in full_unmap.iterrows():
    record = format_fun.created_record(row)
    df_json[row.SRR].append(record)
with open(json_ofile_unmap, "w") as output:
    json.dump(df_json, output)
\ No newline at end of file
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
/usr/local/lib/python3.6/dist-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning
warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
/home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/mapping2MCO_v5.py:312: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
str_matches_odf["SOURCE"] = mco_ifile
-------------------------------- PARAMETERS --------------------------------
--inputPath Path of npl tagged file: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
--iAnnotatedFile Input file of npl tagged file: srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv
--iOntoFile Input file with the ontology entities (MCO-terms): gc_ontology_terms_v2.txt
--iLinksFile Input file with links and id for the ontology (MCO-type-links): None
--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): mco_terms_v0.2.json
--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v2/
--outputFile Output of the mapping process: srr_htregulondb.tsv
--minPerMatch Minimal string matching percentage: 80
--minCRFProbs Minimal crf probabilities allowed: 0.9
-------------------------------- INPUTS --------------------------------
npl tagged file
SRR ... REPO_FILE
0 SRR5742248 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
5 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
7 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
[3 rows x 15 columns]
ontology entities
TERM_ID TERM_NAME
0 MCO000000014 generically dependent continuant
1 MCO000000015 radiation
2 MCO000000016 electromagnetic radiation
additional ontology of synonyms (MCO-syn-json)
ENTITY_NAME TERM_ID TERM_NAME
MCO000000019 continuant MCO000000019
MCO000002475 culture medium MCO000002475
MCO000002467_0 Organism MCO000002467 biologicentity
-------------------------------- RESULTS --------------------------------
Tracking exact terms to MCO...
Mapping 4099 terms to MCO based on exact strings...
Mapping 3770 terms to MCO - synonyms based on exact strings...
Total of terms mapped by exact strings: 387
Saving filtered terms from raw mapping...
3712 unmapped terms based on exact strings
Dropping duplicated unmapped term names...
206 unmapped unique terms based on exact strings
compute string similarity...
Mapping to MCO 206 terms based on string similarity...
Mapping to MCO - synonyms 152 terms based on string similarity...
Unique terms mapped by string similarity: 73
Total of terms mapped by string similarity: 1992
Saving filtered terms from str mapping...
--------------------END----------------------
Total of terms mapped: 2379
Total of terms unmapped: 1720
/usr/local/lib/python3.6/dist-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning
warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
/home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/mapping2MCO_v5.py:312: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
str_matches_odf["SOURCE"] = mco_ifile
-------------------------------- PARAMETERS --------------------------------
--inputPath Path of npl tagged file: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
--iAnnotatedFile Input file of npl tagged file: srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv
--iOntoFile Input file with the ontology entities (MCO-terms): gc_ontology_terms_v2.txt
--iLinksFile Input file with links and id for the ontology (MCO-type-links): None
--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): mco_terms_v0.2.json
--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v2/
--outputFile Output of the mapping process: srr_htregulondb.tsv
--minPerMatch Minimal string matching percentage: 80
--minCRFProbs Minimal crf probabilities allowed: 0.9
-------------------------------- INPUTS --------------------------------
npl tagged file
SRR ... REPO_FILE
0 SRR5742248 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
5 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
7 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
[3 rows x 15 columns]
ontology entities
TERM_ID TERM_NAME
0 MCO000000014 generically dependent continuant
1 MCO000000015 radiation
2 MCO000000016 electromagnetic radiation
additional ontology of synonyms (MCO-syn-json)
ENTITY_NAME TERM_ID TERM_NAME
MCO000000019 continuant MCO000000019
MCO000002475 culture medium MCO000002475
MCO000002467_0 Organism MCO000002467 biologicentity
-------------------------------- RESULTS --------------------------------
Tracking exact terms to MCO...
Mapping 4099 terms to MCO based on exact strings...
Mapping 3770 terms to MCO - synonyms based on exact strings...
Total of terms mapped by exact strings: 387
Saving filtered terms from raw mapping...
3712 unmapped terms based on exact strings
Dropping duplicated unmapped term names...
206 unmapped unique terms based on exact strings
compute string similarity...
Mapping to MCO 206 terms based on string similarity...
Mapping to MCO - synonyms 152 terms based on string similarity...
Unique terms mapped by string similarity: 73
Total of terms mapped by string similarity: 1992
Saving filtered terms from str mapping...
--------------------END----------------------
Total of terms mapped: 2379
Total of terms unmapped: 1720
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
/usr/local/lib/python3.6/dist-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning
warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
/home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/mapping2MCO_v6.py:313: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
str_matches_odf["SOURCE"] = mco_ifile
-------------------------------- PARAMETERS --------------------------------
--inputPath Path of npl tagged file: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
--iAnnotatedFile Input file of npl tagged file: srr_htregulondb_correct_gc_terms_07_rev_Victor.tsv
--iOntoFile Input file with the ontology entities (MCO-terms): gc_ontology_terms_v2.txt
--iLinksFile Input file with links and id for the ontology (MCO-type-links): None
--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): mco_terms_v0.2.json
--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v4/curated/
--outputFile Output of the mapping process: srr_htregulondb_correct_gc_terms_07_rev_Victor_mapped.tsv
--minPerMatch Minimal string matching percentage: 80
--minCRFProbs Minimal crf probabilities allowed: 0.9
-------------------------------- INPUTS --------------------------------
npl tagged file
SRR ... REPO_FILE
0 SRR771533 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
2 SRR771534 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
24 SRR3194453 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
[3 rows x 14 columns]
ontology entities
TERM_ID TERM_NAME
0 MCO000000014 generically dependent continuant
1 MCO000000015 radiation
2 MCO000000016 electromagnetic radiation
additional ontology of synonyms (MCO-syn-json)
ENTITY_NAME TERM_ID TERM_NAME
MCO000000019 continuant MCO000000019
MCO000002475 culture medium MCO000002475
MCO000002467_0 Organism MCO000002467 biologicentity
-------------------------------- RESULTS --------------------------------
Tracking exact terms to MCO...
Mapping 2149 terms to MCO based on exact strings...
Mapping 1820 terms to MCO - synonyms based on exact strings...
Total of terms mapped by exact strings: 387
Saving filtered terms from raw mapping...
1762 unmapped terms based on exact strings
Dropping duplicated unmapped term names...
104 unmapped unique terms based on exact strings
compute string similarity...
Mapping to MCO 104 terms based on string similarity...
Mapping to MCO - synonyms 61 terms based on string similarity...
Unique terms mapped by string similarity: 58
Total of terms mapped by string similarity: 1570
Saving filtered terms from str mapping...
--------------------END----------------------
Total of terms mapped: 1957
Total of terms unmapped: 192
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
/usr/local/lib/python3.6/dist-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning
warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
/home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/mapping2MCO_v5.py:312: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
str_matches_odf["SOURCE"] = mco_ifile
-------------------------------- PARAMETERS --------------------------------
--inputPath Path of npl tagged file: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
--iAnnotatedFile Input file of npl tagged file: srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv
--iOntoFile Input file with the ontology entities (MCO-terms): gc_ontology_terms_v2.txt
--iLinksFile Input file with links and id for the ontology (MCO-type-links): None
--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): mco_terms_v0.2.json
--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v3/
--outputFile Output of the mapping process: srr_htregulondb.tsv
--minPerMatch Minimal string matching percentage: 80
--minCRFProbs Minimal crf probabilities allowed: 0.9
-------------------------------- INPUTS --------------------------------
npl tagged file
SRR ... REPO_FILE
0 SRR5742248 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
5 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
7 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
[3 rows x 15 columns]
ontology entities
TERM_ID TERM_NAME
0 MCO000000014 generically dependent continuant
1 MCO000000015 radiation
2 MCO000000016 electromagnetic radiation
additional ontology of synonyms (MCO-syn-json)
ENTITY_NAME TERM_ID TERM_NAME
MCO000000019 continuant MCO000000019
MCO000002475 culture medium MCO000002475
MCO000002467_0 Organism MCO000002467 biologicentity
-------------------------------- RESULTS --------------------------------
Tracking exact terms to MCO...
Mapping 4099 terms to MCO based on exact strings...
Mapping 3770 terms to MCO - synonyms based on exact strings...
Total of terms mapped by exact strings: 387
Saving filtered terms from raw mapping...
3712 unmapped terms based on exact strings
Dropping duplicated unmapped term names...
206 unmapped unique terms based on exact strings
compute string similarity...
Mapping to MCO 206 terms based on string similarity...
Mapping to MCO - synonyms 152 terms based on string similarity...
Unique terms mapped by string similarity: 73
Total of terms mapped by string similarity: 1992
Saving filtered terms from str mapping...
--------------------END----------------------
Total of terms mapped: 2379
Total of terms unmapped: 1720
/usr/local/lib/python3.6/dist-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning
warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
/home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/bin/mapping2MCO_v6.py:313: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
str_matches_odf["SOURCE"] = mco_ifile
-------------------------------- PARAMETERS --------------------------------
--inputPath Path of npl tagged file: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
--iAnnotatedFile Input file of npl tagged file: srr_htregulondb_model_Run3_v10_S1_False_S2_True_S3_False_S4_False_Run3_v10.tsv
--iOntoFile Input file with the ontology entities (MCO-terms): gc_ontology_terms_v2.txt
--iLinksFile Input file with links and id for the ontology (MCO-type-links): None
--iSynFile Input file for the additional ontology of synonyms (MCO-syn-json): mco_terms_v0.2.json
--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v4/
--outputFile Output of the mapping process: srr_htregulondb.tsv
--minPerMatch Minimal string matching percentage: 80
--minCRFProbs Minimal crf probabilities allowed: 0.9
-------------------------------- INPUTS --------------------------------
npl tagged file
SRR ... REPO_FILE
0 SRR5742248 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
5 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
7 SRR5742250 ... http://pakal.ccg.unam.mx/cmendezc/automatic-ex...
[3 rows x 14 columns]
ontology entities
TERM_ID TERM_NAME
0 MCO000000014 generically dependent continuant
1 MCO000000015 radiation
2 MCO000000016 electromagnetic radiation
additional ontology of synonyms (MCO-syn-json)
ENTITY_NAME TERM_ID TERM_NAME
MCO000000019 continuant MCO000000019
MCO000002475 culture medium MCO000002475
MCO000002467_0 Organism MCO000002467 biologicentity
-------------------------------- RESULTS --------------------------------
Tracking exact terms to MCO...
Mapping 3769 terms to MCO based on exact strings...
Mapping 3440 terms to MCO - synonyms based on exact strings...
Total of terms mapped by exact strings: 387
Saving filtered terms from raw mapping...
3382 unmapped terms based on exact strings
Dropping duplicated unmapped term names...
206 unmapped unique terms based on exact strings
compute string similarity...
Mapping to MCO 206 terms based on string similarity...
Mapping to MCO - synonyms 152 terms based on string similarity...
Unique terms mapped by string similarity: 73
Total of terms mapped by string similarity: 1668
Saving filtered terms from str mapping...
--------------------END----------------------
Total of terms mapped: 2055
Total of terms unmapped: 1714
This diff could not be displayed because it is too large.
-------------------------------- PARAMETERS --------------------------------
--inputPath Path of npl tagged file: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/input/
--iAnnotatedFile Input file of npl tagged file: No_GSM_Metadata_Selected_v4.tsv
--outputPath Output path to place output files: /home/egaytan/automatic-extraction-growth-conditions/mapping_MCO/output/v2/
--outputFile Output of the mapping process: zika.json
Total zika terms: 2351