# mapping_fun.py 13 KB
from fuzzywuzzy import fuzz
from pandas import DataFrame as DF


"""
Params
----------
    term_type:
        str with the term type abbreviation (CRF tags)

Example
----------
transterm_npl2mco("Org")

Missing MCO
----------
    Pressure 
    Growth rate

Missing NPL
----------
    Strain #mapped to Organism
    Substrain #mapped to Organism
    Technique


Valid terms MCO link
----------
Aeration
Genetic background
Growth phase
Medium
Medium supplement
Optical Density (OD)
Organism
Pressure
Temperature
pH

"""
def transterm_npl2mco( term_type ):
    """Translate an NPL term type abbreviation into its MCO term type.

    Params
    ----------
        term_type:
            str with the term type abbreviation (CRF tags)

    Output
    ----------
    The MCO term type name for a known abbreviation; any other value is
    handed back unchanged.
    """
    # Abbreviation -> MCO term type. Several tags share one MCO type.
    npl_to_mco = {
        "Strain": "Organism",
        "Substrain": "Organism",
        "Gtype": "Genetic background",
        "Med": "Medium",
        "Supp": "Medium supplement",
        "Air": "Aeration",
        "Temp": "Temperature",
        "pH": "pH",
        "OD": "Optical Density (OD)",
        "Phase": "Growth phase",
        "Vess": "Vessel type",
        "Agit": "Agitation speed",
        "exTag": "exTag Type",
        "Anti": "exTag Type",
        "Technique": "exTag Type",
    }
    # Unknown abbreviations fall through as-is.
    return npl_to_mco.get(term_type, term_type)

"""
Description
----------
This function maps the growth conditions extracted automatically with
the NPL annotation framework to the ontology entities provided.

Params
----------
    npl_df: 
        a data frame with the automatic tagging from the npl workflow with
        at least the following columns:
        TERM_NAME: 
            a string with a text fragment corresponding to the automatic 
            annotated GC-term by the NPL-workflow.
        TERM_TYPE:          
            a string with the term type abbreviation assigned by the 
            NPL-workflow
    mco_df:
        a data frame with the ontology data base with at least the following
        columns:
        TERM_NAME: 
            a string with a text fragment corresponding to case entities 
            annotated of GC-terms in the ontology data base
        TERM_ID: 
            identifiers for case entities of GC-terms available on the
            ontology data base
    mco_links: 
        a data frame with the links to classify the term based on identifiers
        of the ontology data base with at least the following columns:
        TERM_ID: 
            identifiers for case entities of GC-terms available on the
            ontology data base
        TERM_TYPE: 
            a string with the term type according to the ontology data base 
    
Output
----------
A data frame with the GC-terms plus the additional info from the ontology
data base for the mapped terms.


Example
----------
import pandas as pd
import format_fun
import mapping_fun

ex_dic = {"GSE": ["GSEnnn", "GSEnnn", "GSEnnn"],
    "GSM": ["GSMnnn", "GSMnnn", "GSMnnn"],
    "GPL_PMID": ["GPLnnn-PMID:nnn","GPLnnn-PMID:nnn","GPLnnn-PMID:nnn"],
    "BANGLINE": ["growth_protocol_ch1.1", "growth_protocol_ch1.1", "growth_protocol_ch1.1"],
    "FULLTEXT": ["Loreum loreum <tag> loreum 37 </tag>","Loreum loreum <tag> loreum 37 </tag>","Loreum loreum <tag> loreum 37 </tag>"],
    "TERM_NAME": ["loreum 37","loreum 37","loreum 37"],
    "TERM_TYPE": ["exTag Type", "exTag Type", "exTag Type"]
}
     
npl_df = pd.DataFrame(data=ex_dic)

ex_dic = {"TERM_ID": ["MCOnnnn", "MCOnnnn", "MCOnnnn"],
         "TERM_CLASS_ID": ["XXXnnnn", "XXXnnnn", "XXXnnnn"],
         "TERM_CLASS_PARENT_ID": ["XXXnnnn","XXXnnnn","XXXnnnn"],
         "TERM_NAME": ["loreum 37","loreum 37","loreum 37"],
         "TERM_DESCRIPTION": ["Loreum loreum","Loreum loreum","Loreum loreum"],
         "TERM_HEAD": ["Loreum loreum", "Loreum loreum", "Loreum loreum"], 
         }
mco_df = pd.DataFrame(data=ex_dic)
         
ex_dic = {"TERM_ID": ["MCOnnnn", "MCOnnnn", "MCOnnnn"],
         "GC_ID": ["MCOnnnn", "MCOnnnn", "MCOnnnn"],
         "TERM_TYPE": ["exTag Type", "exTag Type", "exTag Type"],
         "TERM_ORDER": ["Loreum", "Loreum", "Loreum"], 
         }
mco_links =  pd.DataFrame(data=ex_dic)

mapping_fun.raw_map_mco(npl_df, mco_df, mco_links = None)
mapping_fun.raw_map_mco(npl_df, mco_df, mco_links = mco_links)

    
"""
def raw_map_mco(npl_df, mco_df, mco_links = None, unmap = False):
    """Attach ontology information to NPL terms by exact matching.

    Params
    ----------
        npl_df:
            a data frame with the NPL terms; it is joined on TERM_NAME, and
            also on TERM_TYPE when ``mco_links`` is given.
        mco_df:
            a data frame with the ontology cases (TERM_NAME, plus TERM_ID
            when ``mco_links`` is given).
        mco_links:
            optional data frame (TERM_ID, TERM_TYPE). When given it is
            inner-joined to ``mco_df`` on TERM_ID first.
        unmap:
            when true every row of ``npl_df`` is kept and unmatched rows get
            missing values; otherwise only matched rows are returned.

    Output
    ----------
    The merged data frame.
    """
    if mco_links is None:
        ontology = mco_df
        join_keys = ["TERM_NAME"]
    else:
        # Bring the term type of every ontology case into the frame so the
        # match can be restricted to the same type.
        ontology = mco_df.merge(mco_links, how="inner", on=["TERM_ID"])
        join_keys = ["TERM_NAME", "TERM_TYPE"]

    # "left" keeps unmapped terms, "inner" drops them.
    join_kind = "left" if unmap else "inner"
    return npl_df.merge(ontology, how=join_kind, on=join_keys)
    
"""
Description
----------

This wrapper function for the token functions of fuzzyWuzzy calculates string
similarities between an input string, a text fragment of a growth condition
(term_name), and a set of cases by computing two similarity scores
with the token_sort_ratio() and the token_set_ratio() functions. Finally,
it returns the match with the largest sort score (token_sort_ratio).


Params
----------
    term_name:
        a string with a text fragment corresponding to the annotated 
        GC-term in the MCO from RegulonDB.
    cases:
        a data frame with the cases to compare; it must provide the
        TERM_NAME and TERM_ID columns.

Output
----------
A list
    1: input term name
    2: matching case
    3: set score (score from token_set_ratio)
    4: sort score (score from token_sort_ratio)
    5: TERM_ID of the matching case

Notes
----------
Token functions 

Tokenize the strings, change capitals to lowercase, and remove punctuation. 
It first sorts strings alphabetically and then joins them together. Finally, 
the fuzz.ratio() is calculated. 

token_sort_ratio()
    Compares strings with the same spelling but in a different order.
token_set_ratio()
    Compares strings with a significant difference in length.
    It removes the common tokens before calculating the fuzz.ratio()
  
"""
def token_ratio_wrap(term_name, cases):
    """Return the case most similar to ``term_name``.

    Every TERM_NAME in ``cases`` is scored against ``term_name`` with
    ``fuzz.token_set_ratio`` and ``fuzz.token_sort_ratio``. The case with the
    highest sort ratio is the one reported.

    Params
    ----------
        term_name:
            the text fragment to look up.
        cases:
            a data frame with at least the columns TERM_NAME and TERM_ID.

    Output
    ----------
    A list ``[term_name, case_term_name, set_ratio, sort_ratio, id]``.
    When ``cases`` has no rows the "no match" list
    ``[term_name, "", 0, 0, ""]`` is returned.
    """
    if cases.empty:
        # Nothing to compare against: report "no match" rather than failing
        # on the positional lookup of the best row further down.
        return [term_name, "", 0, 0, ""]

    # Read the two columns directly instead of building one Series per row
    # with iterrows().
    scored = [
        [
            case_name,
            fuzz.token_set_ratio(term_name, case_name),
            fuzz.token_sort_ratio(term_name, case_name),
            case_id,
        ]
        for case_name, case_id in zip(cases["TERM_NAME"], cases["TERM_ID"])
    ]
    odf = DF(scored, columns=["case_term_name", "set_ratio", "sort_ratio", "id"])

    # Rank by the sort ratio, highest first, and keep the top row.
    best = odf.sort_values(by=["sort_ratio"], ascending=False).iloc[0, :]
    return [term_name] + list(best)

"""
Description
----------
This function calculates the string similarity between a text fragment of a
growth condition (term_name) and the comparison cases. It first selects
the string cases (mco_df) of the requested term type and then performs a
string matching with the token_ratio_wrap function.

Params
----------
    term_name: 
        a string with a text fragment corresponding to the annotated
        GC-term in the MCO from RegulonDB.
    ttype: 
        str with the term type abbreviation (CRF tags), or "full" to
        compare against every case without filtering by term type
    mco_df: ontology data base
        name:
            term name
        synonyms:
            all related synonyms
        _id:
            associated term name RegulonDB id
        ontologies_id:
            ontologies id, a column with empty fields
        oboId:
            id from Open Biological and Biomedical Ontologies, a column
            with empty fields.
    ncol_cases:
        an integer, the number of the column with the cases to match
        in the mco_df (ontology data base); currently not used by the
        function

Output
----------
A list
    1: input term name
    2: matching case
    3: set score (score from token_set_ratio)
    4: sort score (score from token_sort_ratio)
    5: TERM_ID of the matching case

Notes
----------
"""
def sym_score(term_name, ttype, mco_df, ncol_cases):
    """Find the best string match for ``term_name`` among ontology cases.

    Params
    ----------
        term_name:
            the text fragment to look up.
        ttype:
            term type used to keep only the rows of ``mco_df`` whose
            TERM_TYPE equals it; the value "full" disables the filter.
        mco_df:
            a data frame with the ontology cases. TERM_TYPE is required
            unless ``ttype`` is "full".
        ncol_cases:
            not used; kept so existing callers keep working.

    Output
    ----------
    The list produced by ``token_ratio_wrap`` for the remaining cases, or
    ``[term_name, "", 0, 0, ""]`` when there are no cases to compare.
    """
    if ttype != "full":
        # Restrict the candidates to the requested term type.
        mco_df = mco_df[mco_df.TERM_TYPE == ttype]

    # Checked on both paths: an empty ontology with ttype == "full" used to
    # fall through and fail while picking the best row.
    if mco_df.empty:
        return [term_name, "", 0, 0, ""]

    return token_ratio_wrap(term_name, mco_df)

"""
Description
----------
This function maps the growth conditions extracted automatically with
the NPL annotation framework to the ontology entities provided by using
string similarities. Then, it filters the best hits based on a
minimal similarity score.

Params
----------
    npl_df: 
        a data frame with the automatic tagging from the npl workflow with
        at least the following columns:
        TERM_NAME: 
            a string with a text fragment corresponding to the automatic 
            annotated GC-term by the NPL-workflow.
        TERM_TYPE:          
            a string with the term type abbreviation assigned by the 
            NPL-workflow
    mco_df:
        a data frame with the ontology data base with at least the following
        columns:
        TERM_NAME: 
            a string with a text fragment corresponding to case entities 
            annotated of GC-terms in the ontology data base
        TERM_ID: 
            identifiers for case entities of GC-terms available on the
            ontology data base
    mco_links: 
        a data frame with the links to classify the term based on identifiers
        of the ontology data base with at least the following columns:
        TERM_ID: 
            identifiers for case entities of GC-terms available on the
            ontology data base
        TERM_TYPE: 
            a string with the term type according to the ontology data base
    min_match:
        an integer with the minimal similarity score to filter best matching

Output
----------
A data frame with the GC-terms plus the additional info from the ontology
data base for the mapped terms and additional columns with the similarity
scores.

Example
----------
import pandas as pd
import format_fun
import mapping_fun

ex_dic = {"GSE": ["GSEnnn", "GSEnnn", "GSEnnn"],
    "GSM": ["GSMnnn", "GSMnnn", "GSMnnn"],
    "GPL_PMID": ["GPLnnn-PMID:nnn","GPLnnn-PMID:nnn","GPLnnn-PMID:nnn"],
    "BANGLINE": ["growth_protocol_ch1.1", "growth_protocol_ch1.1", "growth_protocol_ch1.1"],
    "FULLTEXT": ["Loreum loreum <tag> loreum 37 </tag>","Loreum loreum <tag> loreum 37 </tag>","Loreum loreum <tag> loreum 37 </tag>"],
    "TERM_NAME": ["loreum 37","loreum 37","loreum 37"],
    "TERM_TYPE": ["exTag Type", "exTag Type", "exTag Type"]
}
     
npl_df = pd.DataFrame(data=ex_dic)

ex_dic = {"TERM_ID": ["MCOnnnn", "MCOnnnn", "MCOnnnn"],
         "TERM_CLASS_ID": ["XXXnnnn", "XXXnnnn", "XXXnnnn"],
         "TERM_CLASS_PARENT_ID": ["XXXnnnn","XXXnnnn","XXXnnnn"],
         "TERM_NAME": ["loreum 37","loreum 37","loreum 37"],
         "TERM_DESCRIPTION": ["Loreum loreum","Loreum loreum","Loreum loreum"],
         "TERM_HEAD": ["Loreum loreum", "Loreum loreum", "Loreum loreum"], 
         }
mco_df = pd.DataFrame(data=ex_dic)
         
ex_dic = {"TERM_ID": ["MCOnnnn", "MCOnnnn", "MCOnnnn"],
         "GC_ID": ["MCOnnnn", "MCOnnnn", "MCOnnnn"],
         "TERM_TYPE": ["exTag Type", "exTag Type", "exTag Type"],
         "TERM_ORDER": ["Loreum", "Loreum", "Loreum"], 
         }
mco_links =  pd.DataFrame(data=ex_dic)

mapping_fun.str_match_map_mco(npl_df, mco_df, mco_links = None,  min_match=None)
mapping_fun.str_match_map_mco(npl_df, mco_df, mco_links = None,  min_match=2)

Notes
----------
mco_links
"""
def str_match_map_mco(npl_df, mco_df, mco_links = None,  min_match=None, npl_merges = True):
    """Map NPL terms to ontology cases by string similarity.

    One best match is computed per row of ``npl_df`` with ``sym_score``.

    Params
    ----------
        npl_df:
            a data frame with at least TERM_NAME, plus TERM_TYPE when
            ``mco_links`` is given.
        mco_df:
            a data frame with the ontology cases (TERM_NAME, TERM_ID).
        mco_links:
            optional data frame (TERM_ID, TERM_TYPE). When given it is
            merged into ``mco_df`` on TERM_ID and each term is only compared
            with the cases of its own TERM_TYPE; otherwise every term is
            compared with all cases.
        min_match:
            when not None, only rows whose SET score is strictly greater
            than this value are kept.
        npl_merges:
            when true the scores are appended to the rows of ``npl_df``;
            otherwise only the scores table is returned.

    Output
    ----------
    A data frame with the columns CASE_MATCH, SET, SORT and TERM_ID (and
    TERM_NAME), preceded by the columns of ``npl_df`` when ``npl_merges``
    is true. Column names present on both sides get the "_x" / "_y"
    suffixes.
    """
    # Columns are read by label; integer lookups on a label-indexed row are
    # deprecated in recent pandas.
    term_names = list(npl_df["TERM_NAME"])

    if mco_links is not None:
        mco_df = DF.merge(mco_df, mco_links, on=["TERM_ID"])
        term_types = list(npl_df["TERM_TYPE"])
    else:
        # "full" tells sym_score not to filter by term type.
        term_types = ["full"] * len(term_names)

    case_col = list(mco_df.columns).index("TERM_NAME")

    scores_list = [
        sym_score(term_name=name, ttype=ttype, mco_df=mco_df, ncol_cases=case_col)
        for name, ttype in zip(term_names, term_types)
    ]
    match_scores_df = DF(scores_list, columns=["TERM_NAME", "CASE_MATCH", "SET", "SORT", "TERM_ID"])

    if npl_merges:
        # scores_list holds exactly one entry per npl_df row, in row order,
        # so the scores are attached by position. Merging on TERM_NAME
        # multiplied rows whenever a fragment was repeated and could pair a
        # row with a match computed for another TERM_TYPE.
        npl_matches = npl_df.reset_index(drop=True).join(
            match_scores_df.drop(columns=["TERM_NAME"]),
            lsuffix="_x",
            rsuffix="_y",
        )
    else:
        npl_matches = match_scores_df

    if min_match is not None:
        npl_matches = npl_matches[npl_matches.SET > min_match]

    return npl_matches