format_fun_v6.py
5.89 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
from numpy import nan
#from collections import OrderedDict
from pandas import DataFrame as DF
import json
from collections import defaultdict
import format_fun_v6 as format_fun
def to_json(df, source_info, evidence_source, ofname):
    """Group the rows of *df* by SRR and write one JSON record per SRR.

    Each row is turned into a term dictionary by
    ``format_fun.get_term_info`` and collected under that row's ``SRR``
    value. The first row of every distinct SRR is then handed, together
    with the collected terms, to ``format_fun.created_record``. The
    resulting ``{SRR: record}`` mapping is dumped to *ofname*.

    Args:
        df: DataFrame providing at least the ``SRR`` and ``MAP`` columns
            read here, plus whatever the ``format_fun`` helpers access.
        source_info: Forwarded as ``source`` to both helpers.
        evidence_source: Forwarded as ``esource`` to ``created_record``.
        ofname: Path of the output file; it is opened in ``"w"`` mode.
    """
    # First pass: one term entry per input row, bucketed by SRR.
    terms_by_srr = defaultdict(list)
    for _, term_row in df.iterrows():
        term_entry = format_fun.get_term_info(
            term_row,
            source=source_info,
            map=term_row.MAP,
        )
        terms_by_srr[term_row.SRR].append(term_entry)

    # Second pass: only the first row of each SRR supplies record-level data.
    first_rows = df.drop_duplicates("SRR", keep="first")
    records = {}
    for _, srr_row in first_rows.iterrows():
        records[srr_row.SRR] = format_fun.created_record(
            info_df=srr_row,
            term_list=terms_by_srr[srr_row.SRR],
            source=source_info,
            esource=evidence_source,
        )

    with open(ofname, "w") as output:
        json.dump(records, output, separators=(',', ':'), indent=4)
def get_score(info_df):
    """Return a mapping describing how a term matched the MCO.

    Args:
        info_df: Row-like object exposing ``CASE_MATCH``, ``SET`` and
            ``SORT`` attributes.

    Returns:
        A dict with the keys ``"type"`` and ``"score"``. ``"score"`` is
        always ``info_df.SET``. ``"type"`` is ``"term present on MCO"``
        when ``CASE_MATCH`` is ``"MCO"`` and both ``SET`` and ``SORT``
        equal 100, and ``"string similarity"`` otherwise.
    """
    exact_match = (
        info_df.CASE_MATCH == "MCO"
        and info_df.SET == 100
        and info_df.SORT == 100
    )
    # The original exact-match branch only assigned an unused label and left
    # the returned mapping undefined; both branches now yield a mapping.
    # NOTE(review): the exact-match mapping shape mirrors the similarity one;
    # confirm this is the intended output for terms present in the MCO.
    match_type = "term present on MCO" if exact_match else "string similarity"
    return {
        "type": match_type,
        "score": info_df.SET,
    }
"""
- **name**: name of the term as registered in the MCO
- **term_id**: identifier of the term in RegulonDB (if it exists)
- **term_type**: type of term, one of the following: "Organism", "Genetic background", "Medium", "Medium supplement", "Aeration", "Temperature", "pH", "Pressure", "Optical Density (OD)", "Growth phase", "Growth rate", "Vessel type", "Agitation speed"
- **source_data**: subdocument holding the GEO information from which the GC information was extracted
- **source**: source of the data [ GEO, ]
- **id**: identifier of the record in the database or data source
- **field**: field from which the GC information is taken [ metadata field ]
- **associatedPhrase**: phrase from which the information was taken
"""
def get_term_info(info_df, source, map=True):
    """Build the term dictionary for a single row.

    NaN values in the row are replaced by empty strings before any field
    is read.

    Args:
        info_df: Row exposing ``TERM_NAME``, ``TERM_ID``, ``TERM_TYPE``,
            ``GSM``, ``BANGLINE``, ``FULL_TEXT`` and ``SET``, and
            supporting ``replace`` (e.g. a pandas Series).
        source: Stored under ``source_data["source"]``.
        map: Forwarded to ``get_description`` to choose the description.

    Returns:
        A dict with ``name``, ``term_id``, ``term_type`` and a nested
        ``source_data`` dict.
    """
    row = info_df.replace(nan, "", regex=True)

    # Provenance of the term: where it was found and how well it matched.
    provenance = {
        "source": source,
        "id": row.GSM,
        "field": row.BANGLINE,
        "associatedPhrase": row.FULL_TEXT,
        "description": get_description(row, map),
        "similarity_percentage": row.SET,
    }
    return {
        "name": row.TERM_NAME,
        "term_id": row.TERM_ID,
        "term_type": row.TERM_TYPE,
        "source_data": provenance,
    }
"""
- **objectId**: identifier in the source database
- **externalCrossReferences_name**: name of the DB [ GEO ]
"""
def get_crossref_info(info_df, source):
    """Build the external cross-reference entry for a row.

    Args:
        info_df: Row-like object exposing a ``GSM`` attribute.
        source: Name stored under ``externalCrossReferences_name``.

    Returns:
        A dict with the keys ``objectId`` (the row's ``GSM``) and
        ``externalCrossReferences_name``.
    """
    return dict(
        objectId=info_df.GSM,
        externalCrossReferences_name=source,
    )
"""
- **evidence_id**: RegulonDB identifier associated with the evidence
- **evidence_name**: name of the evidence, for when it has no identifier
- **pmid**: PubMed ID
"""
def get_cite_info(info_df, esource):
    """Build the citation entry for a row.

    Args:
        info_df: Row-like object exposing a ``PMID`` attribute.
        esource: Name stored under ``evidence_name``.

    Returns:
        A dict with ``evidence_id`` (always an empty string),
        ``evidence_name`` and ``pmid``.
    """
    return dict(
        evidence_id="",
        evidence_name=esource,
        pmid=info_df.PMID,
    )
def get_description(info_df, map=True):
    """Return the text describing how a term relates to the RegulonDB MCO.

    Args:
        info_df: Row-like object exposing ``CASE_MATCH``, ``SET`` and
            ``SORT``; these are only read when *map* is truthy.
        map: Falsy when the term was not mapped at all.

    Returns:
        ``"absent in RegulonDB MCO"`` when *map* is falsy,
        ``"RegulonDB MCO term"`` when ``CASE_MATCH`` is ``"MCO"`` and both
        ``SET`` and ``SORT`` equal 100, and
        ``"Similar term in RegulonDB MCO"`` otherwise.
    """
    if not map:
        return "absent in RegulonDB MCO"
    if info_df.CASE_MATCH == "MCO" and info_df.SET == 100 and info_df.SORT == 100:
        return "RegulonDB MCO term"
    return "Similar term in RegulonDB MCO"
"""
Run this from main for each field.
Returned keys and their types:
id: string
name: string
description: string
terms: list of dict
externalCrossReferences: list of dict
citations: list of dict
"""
def created_record(info_df, term_list, source = "GEO", esource = "NPL-CRF"):
    """Assemble the record dictionary for one row.

    NaN values in the row are replaced by empty strings before the
    cross-reference and citation entries are built.

    Args:
        info_df: Row supporting ``replace`` (e.g. a pandas Series) and
            exposing the attributes read by ``get_crossref_info`` and
            ``get_cite_info``.
        term_list: Stored unchanged under ``terms``.
        source: Forwarded to ``get_crossref_info``.
        esource: Forwarded to ``get_cite_info``.

    Returns:
        A dict with the keys ``id``, ``name``, ``description`` (all empty
        strings), ``terms``, ``externalCrossReferences`` and
        ``citations``; the last two are single-element lists.
    """
    cleaned = info_df.replace(nan, "", regex=True)
    crossref_entry = get_crossref_info(cleaned, source)
    citation_entry = get_cite_info(cleaned, esource)
    return {
        "id": "",
        "name": "",
        "description": "",
        "terms": term_list,
        "externalCrossReferences": [crossref_entry],
        "citations": [citation_entry],
    }
def json2DataFrame(data):
    """Flatten a list of term entries into a synonym DataFrame.

    Entries without a ``regulondb_id`` key are skipped. For the others:

    * if ``synonyms`` is present, one row per synonym is produced;
    * otherwise, if ``hasRelatedSynonyms`` is present, one row per related
      synonym is produced;
    * otherwise a single row with an empty ``TERM_NAME`` is produced.

    Synonym rows are indexed ``<regulondb_id>_<position>``; the fallback
    row is indexed by the bare ``regulondb_id``. Synonyms are lower-cased.

    Args:
        data: Iterable of dict entries.

    Returns:
        DataFrame with the columns ``ENTITY_NAME``, ``TERM_NAME`` and
        ``TERM_ID``.
    """
    rows = {}
    for entry in data:
        if "regulondb_id" not in entry:
            continue

        # "synonyms" wins over "hasRelatedSynonyms" when both are present.
        synonym_field = next(
            (field for field in ("synonyms", "hasRelatedSynonyms") if field in entry),
            None,
        )

        if synonym_field is None:
            rows[entry['regulondb_id']] = {
                'ENTITY_NAME': entry['name'],
                'TERM_NAME': '',
                'TERM_ID': entry['regulondb_id'],
            }
            continue

        for position, synonym in enumerate(entry[synonym_field]):
            row_key = entry['regulondb_id'] + "_" + str(position)
            rows[row_key] = {
                'ENTITY_NAME': entry['name'],
                'TERM_NAME': synonym.lower(),
                'TERM_ID': entry['regulondb_id'],
            }

    # Outer keys become columns in from_dict; transpose so they index rows.
    return DF.from_dict(rows).T