# format_fun_v4.py 4.87 KB
from numpy import nan
from collections import OrderedDict
from pandas import DataFrame as DF
"""
    - **name**: name of the term as registered in the MCO
    - **term_id**: identifier of the term in RegulonDB (if it exists)
    - **term_type**: type of term, one of the following: "Organism", "Genetic background", "Medium", "Medium supplement", "Aeration", "Temperature", "pH", "Pressure", "Optical Density (OD)", "Growth phase", "Growth rate", "Vessel type", "Agitation speed"
    - **source_data**: subdocument containing the GEO information from which the GC information was extracted
    - **source**: source of the data [ GEO,  ]
    - **id**: identifier of the record in the source database or data source
    - **field**: field from which the GC information is taken [ metadata field]
    - **associatedPhrase**: phrase from which the information was taken
"""
def get_term_info(info_df, source):
    """Build the ``terms`` entry for one extracted term.

    Args:
        info_df: Object exposing the attributes ``TERM_NAME``, ``TERM_ID``,
            ``TERM_TYPE``, ``REPO_FILE``, ``GSM``, ``BANGLINE`` and
            ``FULL_TEXT`` (presumably a single row of the NPL output table;
            verify against the caller).
        source: Value stored under the ``"source"`` key.

    Returns:
        dict: Flat mapping with the keys ``name``, ``term_id``,
        ``term_type``, ``source_data``, ``source``, ``id``, ``field`` and
        ``associatedPhrase``, in that order.
    """
    term = {}
    term["name"] = info_df.TERM_NAME  # from the NPL output
    term["term_id"] = info_df.TERM_ID  # from the MCO
    term["term_type"] = info_df.TERM_TYPE  # from the NPL output
    term["source_data"] = info_df.REPO_FILE  # from the NPL output
    term["source"] = source
    term["id"] = info_df.GSM  # from the NPL output
    term["field"] = info_df.BANGLINE  # from the NPL output
    term["associatedPhrase"] = info_df.FULL_TEXT  # from the NPL output
    return term


"""
    - **objectId**: identifier in the source database
    - **externalCrossReferences_name**: name of the DB [ GEO ]
"""
def get_crossref_info(info_df, source):
    """Build the external cross-reference entry for one record.

    Args:
        info_df: Object exposing a ``GSM`` attribute (from the NPL output).
        source: Name stored under ``"externalCrossReferences_name"``.

    Returns:
        dict: Mapping with the keys ``objectId`` and
        ``externalCrossReferences_name``.
    """
    return {
        "objectId": info_df.GSM,
        "externalCrossReferences_name": source,
    }

"""
  - **evidence_id**: RegulonDB identifier associated with the evidence
  - **evidence_name**: name of the evidence, used when it has no identifier
  - **pmid**: PubMed ID
"""
def get_cite_info(info_df, esource):
    """Build the citation entry for one record.

    Args:
        info_df: Object exposing a ``PMID`` attribute.
        esource: Evidence name stored under ``"evidence_name"``.

    Returns:
        dict: Mapping with ``evidence_id`` (always the empty string),
        ``evidence_name`` and ``pmid``.
    """
    citation = {"evidence_id": ""}
    citation["evidence_name"] = esource
    citation["pmid"] = info_df.PMID
    return citation

def get_description(info_df, no_map=False):
    """Describe how a term relates to the MCO.

    Args:
        info_df: Object exposing ``CASE_MATCH``, ``SET`` and ``SORT``
            attributes. It is not read at all when *no_map* is truthy.
        no_map: When truthy, report the term as absent from the MCO.

    Returns:
        dict: ``{"type": "not present on MCO"}`` when *no_map* is truthy;
        ``{"type": "term present on MCO"}`` when ``CASE_MATCH`` is ``"MCO"``
        and both ``SET`` and ``SORT`` equal 100; otherwise
        ``{"type": "string similarity", "score": <SET>}``.
    """
    if no_map:
        return {"type": "not present on MCO"}

    # Same left-to-right short-circuit order: CASE_MATCH, then SET, then SORT.
    is_exact_match = (
        info_df.CASE_MATCH == "MCO"
        and info_df.SET == 100
        and info_df.SORT == 100
    )
    if is_exact_match:
        return {"type": "term present on MCO"}

    return {"type": "string similarity", "score": info_df.SET}
"""
# Run it in the main for each field.


Returned record (key: type)

id: string
name: string
description: list of dict
terms: list of dict
externalCrossReferences: list of dict
citations: list of dict
"""

def created_record(term_info_df, source="GEO", no_map=False, esource="NPL-CRF"):
    """Assemble the full output record for one term.

    NaN values in *term_info_df* are replaced by empty strings before any
    field is read.

    Args:
        term_info_df: pandas object supporting ``.replace`` and attribute
            access to the term columns (presumably a single row, i.e. a
            Series; verify against the caller).
        source: Data source name forwarded to ``get_term_info`` and
            ``get_crossref_info``.
        no_map: Forwarded to ``get_description``.
        esource: Evidence name forwarded to ``get_cite_info``.

    Returns:
        OrderedDict: Keys ``id``, ``name``, ``description``, ``terms``,
        ``externalCrossReferences`` and ``citations``, in that order. The
        last four each hold a single-element list containing a dict.
    """
    cleaned = term_info_df.replace(nan, '', regex=True)
    return OrderedDict([
        ("id", cleaned.TERM_ID),  # only meaningful when the term was mapped
        ("name", cleaned.TERM_NAME),  # a column from the NPL output
        ("description", [get_description(cleaned, no_map=no_map)]),
        ("terms", [get_term_info(cleaned, source)]),
        ("externalCrossReferences", [get_crossref_info(cleaned, source)]),
        ("citations", [get_cite_info(cleaned, esource)]),
    ])

def json2DataFrame(data):
    """Flatten MCO term entries into a synonym lookup table.

    Entries without a ``"regulondb_id"`` key are skipped. Every other entry
    contributes one row per synonym, taken from ``"synonyms"`` or, when that
    is missing or empty, from ``"hasRelatedSynonyms"``. Synonyms are
    lower-cased. An entry with no synonyms at all contributes a single row
    whose ``TERM_NAME`` is the empty string, so the term itself is never
    dropped from the table.

    Args:
        data: Iterable of dict entries. Entries that have ``"regulondb_id"``
            must also have ``"name"``; ``"regulondb_id"`` must be a string
            when the entry has synonyms, because it is concatenated into the
            row label.

    Returns:
        pandas.DataFrame: Columns ``ENTITY_NAME``, ``TERM_NAME`` and
        ``TERM_ID``. Synonym rows are labelled ``"<regulondb_id>_<k>"``
        (``k`` is the synonym position); rows for entries without synonyms
        are labelled ``"<regulondb_id>"``. The three columns are present
        even when no row is produced.
    """
    columns = ["ENTITY_NAME", "TERM_NAME", "TERM_ID"]

    # Keyed by row label so that a repeated label overwrites the earlier
    # row while keeping its original position.
    rows = {}

    for entry in data:
        if "regulondb_id" not in entry:
            continue

        term_id = entry["regulondb_id"]
        # Prefer "synonyms"; fall back to "hasRelatedSynonyms". An empty or
        # null list counts as "no synonyms" instead of silently dropping
        # the entry.
        synonyms = entry.get("synonyms") or entry.get("hasRelatedSynonyms") or []

        if synonyms:
            labelled_names = [
                (term_id + "_" + str(k), syn.lower())
                for k, syn in enumerate(synonyms)
            ]
        else:
            labelled_names = [(term_id, "")]

        for label, term_name in labelled_names:
            rows[label] = {
                "ENTITY_NAME": entry["name"],
                "TERM_NAME": term_name,
                "TERM_ID": term_id,
            }

    # Passing explicit columns keeps the schema stable for empty input.
    return DF(list(rows.values()), index=list(rows), columns=columns)