# format_fun_v4.py
from numpy import nan
from collections import OrderedDict
from pandas import DataFrame as DF
def get_term_info(info_df, source):
    """Build the ``terms`` entry of a record from one annotated row.

    Args:
        info_df: Row-like object exposing the attributes ``TERM_NAME``,
            ``TERM_ID``, ``TERM_TYPE``, ``REPO_FILE``, ``GSM``, ``BANGLINE``
            and ``FULL_TEXT`` (presumably a pandas Series taken from the NPL
            output table -- confirm against the caller).
        source: Name of the data source, e.g. ``"GEO"``.

    Returns:
        dict with the following keys:

        - **name**: name of the term as registered in the MCO.
        - **term_id**: identifier of the term in RegulonDB (if it exists).
        - **term_type**: type of term, one of: "Organism",
          "Genetic background", "Medium", "Medium supplement", "Aeration",
          "Temperature", "pH", "Pressure", "Optical Density (OD)",
          "Growth phase", "Growth rate", "Vessel type", "Agitation speed".
        - **source_data**: described by the original notes as the
          sub-document holding the GEO information the growth condition
          (GC) was extracted from; the code stores ``REPO_FILE`` here.
        - **source**: data source [ GEO, ].
        - **id**: identifier of the record in the source database.
        - **field**: field the GC information was taken from
          [ metadata field ].
        - **associatedPhrase**: phrase the information was taken from.
    """
    # NOTE(review): the original notes may have intended source/id/field/
    # associatedPhrase to be nested under source_data; the code emits them
    # flat, and that layout is kept here -- confirm against the schema.
    return {
        "name": info_df.TERM_NAME,  # NPL output
        "term_id": info_df.TERM_ID,  # MCO
        "term_type": info_df.TERM_TYPE,  # NPL
        "source_data": info_df.REPO_FILE,  # NPL
        "source": source,
        "id": info_df.GSM,  # NPL
        "field": info_df.BANGLINE,  # NPL
        "associatedPhrase": info_df.FULL_TEXT,  # NPL
    }
def get_crossref_info(info_df, source):
    """Build the ``externalCrossReferences`` entry of a record.

    Args:
        info_df: Row-like object exposing the attribute ``GSM``.
        source: Name of the external database, e.g. ``"GEO"``.

    Returns:
        dict with the following keys:

        - **objectId**: identifier in the source database (taken from
          ``info_df.GSM``).
        - **externalCrossReferences_name**: name of the database [ GEO ].
    """
    return {
        "objectId": info_df.GSM,  # NPL
        "externalCrossReferences_name": source,
    }
def get_cite_info(info_df, esource):
    """Build the ``citations`` entry of a record.

    Args:
        info_df: Row-like object exposing the attribute ``PMID``.
        esource: Name of the evidence; stored as ``evidence_name``.

    Returns:
        dict with the following keys:

        - **evidence_id**: RegulonDB identifier associated with the
          evidence. Always the empty string here.
        - **evidence_name**: name of the evidence, used when no identifier
          is available.
        - **pmid**: PubMed ID.
    """
    return {
        "evidence_id": "",
        "evidence_name": esource,
        "pmid": info_df.PMID,
    }
def get_description(info_df, no_map=False):
    """Describe how a term was matched against the MCO.

    Args:
        info_df: Row-like object exposing ``CASE_MATCH``, ``SET`` and
            ``SORT`` (presumably string-similarity scores on a 0-100
            scale -- confirm against the mapping step). Not read at all
            when ``no_map`` is true.
        no_map: When true, the term is reported as absent from the MCO
            without inspecting ``info_df``.

    Returns:
        dict with a ``"type"`` key, one of ``"not present on MCO"``,
        ``"term present on MCO"`` or ``"string similarity"``. The
        similarity case also carries ``"score"`` (the value of
        ``info_df.SET``).
    """
    if no_map:
        return {"type": "not present on MCO"}

    # An exact hit requires the MCO case match and both scores at 100.
    is_exact = (
        info_df.CASE_MATCH == "MCO"
        and info_df.SET == 100
        and info_df.SORT == 100
    )
    if is_exact:
        return {"type": "term present on MCO"}

    return {"type": "string similarity", "score": info_df.SET}
def created_record(term_info_df, source="GEO", no_map=False, esource="NPL-CRF"):
    """Assemble the output record for one annotated term.

    Intended to be run from the main script once per field.

    Args:
        term_info_df: Row holding the term information; it must support
            ``.replace(...)`` and attribute access to its columns
            (presumably a pandas Series -- confirm against the caller).
        source: Name of the data source passed on to the ``terms`` and
            ``externalCrossReferences`` entries.
        no_map: Forwarded to ``get_description``; true when the term has
            no MCO mapping.
        esource: Evidence name passed on to the ``citations`` entry.

    Returns:
        OrderedDict with these keys, in this order:

        - **id**: value of ``TERM_ID``.
        - **name**: value of ``TERM_NAME``.
        - **description**: list with one dict (see ``get_description``).
        - **terms**: list with one dict (see ``get_term_info``).
        - **externalCrossReferences**: list with one dict
          (see ``get_crossref_info``).
        - **citations**: list with one dict (see ``get_cite_info``).
    """
    # Blank out missing values so NaN never reaches the emitted record.
    row = term_info_df.replace(nan, '', regex=True)

    record_dict = OrderedDict()
    # Original author's note: the id should be added only if the term has
    # been mapped.
    record_dict["id"] = row.TERM_ID
    record_dict["name"] = row.TERM_NAME  # a column from the NPL output
    record_dict["description"] = [get_description(row, no_map=no_map)]
    record_dict["terms"] = [get_term_info(row, source)]
    record_dict["externalCrossReferences"] = [get_crossref_info(row, source)]
    record_dict["citations"] = [get_cite_info(row, esource)]
    return record_dict
def json2DataFrame(data):
    """Flatten MCO term entries into a synonym lookup table.

    Entries without a ``"regulondb_id"`` key are skipped. For the others:

    - if the entry has ``"synonyms"``, one row is produced per synonym;
    - otherwise, if it has ``"hasRelatedSynonyms"``, one row is produced
      per related synonym;
    - otherwise a single row with an empty ``TERM_NAME`` is produced.

    ``"synonyms"`` wins whenever the key is present, even if its list is
    empty; in that case no row at all is produced for the entry.

    Args:
        data: Iterable of dict-like term entries (presumably the parsed
            MCO JSON -- confirm against the caller).

    Returns:
        DataFrame with columns ``ENTITY_NAME`` (the entry's ``name``),
        ``TERM_NAME`` (the lower-cased synonym, or ``''``) and ``TERM_ID``
        (the entry's ``regulondb_id``). Rows are indexed by
        ``"<regulondb_id>_<k>"`` for the k-th synonym, or by the bare
        ``regulondb_id`` when the entry has no synonym key.
    """
    mco_syn_dic = {}
    for entry in data:
        if "regulondb_id" not in entry:
            continue
        term_id = entry['regulondb_id']

        # Pick the synonym list to expand; "synonyms" takes precedence.
        if "synonyms" in entry:
            synonyms = entry['synonyms']
        elif "hasRelatedSynonyms" in entry:
            synonyms = entry['hasRelatedSynonyms']
        else:
            # No synonym information: register the term itself.
            mco_syn_dic[term_id] = {
                'ENTITY_NAME': entry['name'],
                'TERM_NAME': '',
                'TERM_ID': term_id,
            }
            continue

        for k, syn in enumerate(synonyms):
            mco_syn_dic[term_id + "_" + str(k)] = {
                'ENTITY_NAME': entry['name'],
                'TERM_NAME': syn.lower(),
                'TERM_ID': term_id,
            }

    # Outer keys become columns in from_dict; transpose so they are rows.
    return DF.from_dict(mco_syn_dic).T