# format_fun_v6.py 5.89 KB
from numpy import nan
#from collections import OrderedDict
from pandas import DataFrame as DF
import json
from collections import defaultdict
import format_fun_v6 as format_fun

def to_json(df, source_info, evidence_source, ofname):
    """Write one JSON record per distinct SRR value found in ``df``.

    Every row of ``df`` is turned into a term record with
    ``format_fun.get_term_info`` and grouped under the row's ``SRR`` value.
    The first row seen for each ``SRR`` then supplies the record-level
    information passed to ``format_fun.created_record``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least the ``SRR`` and ``MAP`` columns, plus whatever
        columns the ``format_fun`` helpers read.
    source_info :
        Forwarded as ``source`` to both ``format_fun`` helpers.
    evidence_source :
        Forwarded as ``esource`` to ``format_fun.created_record``.
    ofname :
        Path of the JSON file to write (opened in ``"w"`` mode).
    """
    # Group the per-row term records by their SRR value.
    terms_by_srr = defaultdict(list)
    for _, record in df.iterrows():
        terms_by_srr[record.SRR].append(
            format_fun.get_term_info(
                record,
                source=source_info,
                map=record.MAP,
            )
        )

    # Only the first row of each SRR describes the record itself.
    first_rows = df.drop_duplicates("SRR", keep="first")
    records = {
        first.SRR: format_fun.created_record(
            info_df=first,
            term_list=terms_by_srr[first.SRR],
            source=source_info,
            esource=evidence_source,
        )
        for _, first in first_rows.iterrows()
    }

    with open(ofname, "w") as handle:
        json.dump(records, handle, separators=(',', ':'), indent=4)

def get_score(info_df):
    """Describe how a term was matched against the MCO, with its score.

    Parameters
    ----------
    info_df :
        Row-like object exposing the ``CASE_MATCH``, ``SET`` and ``SORT``
        attributes (for example a pandas Series).

    Returns
    -------
    dict
        ``{"type": <match kind>, "score": info_df.SET}``. The match kind is
        ``"term present on MCO"`` when ``CASE_MATCH`` is ``"MCO"`` and both
        ``SET`` and ``SORT`` equal 100, otherwise ``"string similarity"``.
    """
    # The exact-match branch used to leave the result unbound, so every
    # exact match raised UnboundLocalError; both branches now build a mapping.
    exact_match = (
        info_df.CASE_MATCH == "MCO"
        and info_df.SET == 100
        and info_df.SORT == 100
    )
    match_kind = "term present on MCO" if exact_match else "string similarity"
    return {
        "type": match_kind,
        "score": info_df.SET,
    }
"""
    - **name**: name of the term as registered in the MCO
    - **term_id**: RegulonDB identifier of the term (if one exists)
    - **term_type**: type of term, one of the following: "Organism", "Genetic background", "Medium", "Medium supplement", "Aeration", "Temperature", "pH", "Pressure", "Optical Density (OD)", "Growth phase", "Growth rate", "Vessel type", "Agitation speed"
    - **source_data**: sub-document holding the GEO information from which the GC information was extracted
    - **source**: source of the data [ GEO,  ]
    - **id**: identifier of the record in the source database or data source
    - **field**: field from which the GC information is taken [ metadata field ]
    - **associatedPhrase**: phrase from which the information was taken
"""
def get_term_info(info_df, source, map=True):
    """Build the term sub-document for a single input row.

    Parameters
    ----------
    info_df :
        Row-like object supporting ``replace`` (e.g. a pandas Series) with
        the ``TERM_NAME``, ``TERM_ID``, ``TERM_TYPE``, ``GSM``, ``BANGLINE``,
        ``FULL_TEXT`` and ``SET`` fields, plus those read by
        ``get_description``.
    source :
        Value stored under ``source_data["source"]``.
    map :
        Forwarded to ``get_description`` to select the description text.

    Returns
    -------
    dict
        Keys ``name``, ``term_id``, ``term_type`` and ``source_data``.
    """
    # NaN cells become empty strings before any field is read.
    cleaned = info_df.replace(nan, "", regex=True)

    name = cleaned.TERM_NAME
    term_id = cleaned.TERM_ID
    term_type = cleaned.TERM_TYPE

    provenance = {
        "source": source,
        "id": cleaned.GSM,
        "field": cleaned.BANGLINE,
        "associatedPhrase": cleaned.FULL_TEXT,
        "description": get_description(cleaned, map),
        "similarity_percentage": cleaned.SET,
    }

    return {
        "name": name,
        "term_id": term_id,
        "term_type": term_type,
        "source_data": provenance,
    }


"""
    - **objectId**: identifier in the source database
    - **externalCrossReferences_name**: name of the DB [ GEO ]
"""
def get_crossref_info(info_df, source):
    """Return the external cross-reference entry for a row.

    ``objectId`` is taken from ``info_df.GSM`` and
    ``externalCrossReferences_name`` is the given ``source``.
    """
    object_id = info_df.GSM
    return {
        "objectId": object_id,
        "externalCrossReferences_name": source,
    }

"""
  - **evidence_id**: RegulonDB identifier associated with the evidence
  - **evidence_name**: name of the evidence, used when it has no identifier
  - **pmid**: PubMed ID
"""
def get_cite_info(info_df, esource):
    """Return the citation entry for a row.

    ``evidence_id`` is always an empty string, ``evidence_name`` is the
    given ``esource`` and ``pmid`` is taken from ``info_df.PMID``.
    """
    pubmed_id = info_df.PMID
    return {
        "evidence_id": "",
        "evidence_name": esource,
        "pmid": pubmed_id,
    }

def get_description(info_df, map=True):
    """Return the text describing how a term relates to the RegulonDB MCO.

    When ``map`` is falsy the row is not inspected and the term is reported
    as absent. Otherwise the term is reported as an MCO term when
    ``CASE_MATCH`` is ``"MCO"`` and both ``SET`` and ``SORT`` equal 100, and
    as a similar term in every other case.
    """
    if not map:
        return "absent in RegulonDB MCO"

    is_exact = (
        info_df.CASE_MATCH == "MCO"
        and info_df.SET == 100
        and info_df.SORT == 100
    )
    if is_exact:
        return "RegulonDB MCO term"
    return "Similar term in RegulonDB MCO"
"""
# Run it in the main script for each field.


Keys of the returned record and their types:

id: string
name: string
description: string
terms: list of dict
externalCrossReferences: list of dict
citations: list of dict

"""

def created_record(info_df, term_list, source = "GEO", esource = "NPL-CRF"):
    """Assemble the full record for one sample.

    Parameters
    ----------
    info_df :
        Row-like object supporting ``replace`` (e.g. a pandas Series); it is
        handed to ``get_crossref_info`` and ``get_cite_info`` after NaN
        values are replaced with empty strings.
    term_list :
        List stored unchanged under ``terms``.
    source :
        Passed to ``get_crossref_info``.
    esource :
        Passed to ``get_cite_info``.

    Returns
    -------
    dict
        Keys ``id``, ``name``, ``description`` (all empty strings),
        ``terms``, ``externalCrossReferences`` and ``citations`` (the last
        two are single-element lists).
    """
    cleaned = info_df.replace(nan, "", regex=True)
    return {
        "id": "",
        "name": "",
        "description": "",
        "terms": term_list,
        "externalCrossReferences": [get_crossref_info(cleaned, source)],
        "citations": [get_cite_info(cleaned, esource)],
    }

def json2DataFrame(data):
    """Flatten MCO term entries into a synonym lookup table.

    Entries without a ``regulondb_id`` key are skipped. For each remaining
    entry the synonyms are taken from ``synonyms`` or, when that is missing
    or empty, from ``hasRelatedSynonyms``. Each synonym yields one row
    indexed ``<regulondb_id>_<position>`` whose ``TERM_NAME`` is the
    lower-cased synonym. An entry with no synonyms at all yields a single
    row indexed by ``regulondb_id`` with an empty ``TERM_NAME``.

    Parameters
    ----------
    data :
        Iterable of dict-like term entries with ``name`` and, optionally,
        ``regulondb_id``, ``synonyms`` and ``hasRelatedSynonyms``.

    Returns
    -------
    pandas.DataFrame
        Columns ``ENTITY_NAME``, ``TERM_NAME`` and ``TERM_ID``; empty (but
        with those columns) when no entry carries a ``regulondb_id``.
    """
    columns = ["ENTITY_NAME", "TERM_NAME", "TERM_ID"]
    rows = {}

    for term in data:
        if "regulondb_id" not in term:
            continue

        term_id = term["regulondb_id"]
        entity_name = term["name"]

        # An empty "synonyms" list used to drop the term from the table
        # altogether; fall back to the related synonyms and, failing that,
        # to the placeholder row so every identified term stays present.
        synonyms = term.get("synonyms") or term.get("hasRelatedSynonyms")

        if synonyms:
            for position, synonym in enumerate(synonyms):
                rows["{}_{}".format(term_id, position)] = {
                    "ENTITY_NAME": entity_name,
                    "TERM_NAME": synonym.lower(),
                    "TERM_ID": term_id,
                }
        else:
            rows[term_id] = {
                "ENTITY_NAME": entity_name,
                "TERM_NAME": "",
                "TERM_ID": term_id,
            }

    if not rows:
        # Keep the schema stable so callers can select columns on an
        # empty result without a KeyError.
        return DF(columns=columns)

    # Outer keys become columns first; transpose to get one row per key.
    return DF.from_dict(rows).T