mapping_fun.py 13 KB

Raw Blame History Permalink

from fuzzywuzzy import fuzz
from pandas import DataFrame as DF


"""
Params
----------
    term_type:
        str with the term type abbreviation (CRF tags)

Example
----------
transterm_npl2mco("Org")

Missing MCO
----------
    Pressure
    Growth rate

Missing NPL
----------
    Strain #mapped to Organism
    Substrain #mapped to Organism
    Technique


Valid terms MCO link
----------
Aeration
Genetic background
Growth phase
Medium
Medium supplement
Optical Density (OD)
Organism
Pressure
Temperature
pH

"""
def transterm_npl2mco(term_type):
    """Translate an NPL term-type abbreviation into its MCO term type.

    Params
    ----------
        term_type:
            str with the term type abbreviation (CRF tags)

    Output
    ----------
    The MCO term type listed for the abbreviation. An abbreviation that is
    not listed in the table is returned unchanged.
    """
    # NPL abbreviation -> MCO term type
    npl_to_mco = {
        "Strain": "Organism", "Substrain": "Organism",
        "Gtype": "Genetic background",
        "Med": "Medium", "Supp": "Medium supplement",
        "Air": "Aeration",
        "Temp": "Temperature",
        "pH": "pH",
        "OD": "Optical Density (OD)",
        "Phase": "Growth phase",
        "Vess": "Vessel type",
        "Agit": "Agitation speed",
        "exTag": "exTag Type", "Anti": "exTag Type", "Technique": "exTag Type",
    }
    # Fall back to the input itself when there is no translation.
    return npl_to_mco.get(term_type, term_type)

"""
Description
----------
This function maps those growth conditions extracted automatic with
the NPL annotation framework to ontology entities provided.

Params
----------
    npl_df:
        a data frame with the automatic tagging from the npl workflow with
        at least the following columns:
        TERM_NAME:
            a string with a text fragment corresponding to the automatic
            annotated GC-term by the NPL-workflow.
        TERM_TYPE:
            a string with the term type abbreviation assigned by the
            NPL-workflow
    mco_df:
        a data frame with the ontology data base with at least the following
        columns:
        TERM_NAME:
            a string with a text fragment corresponding to case entities
            annotated of GC-terms in the ontology data base
        TERM_ID:
            identifiers for case entities of GC-terms available on the
            ontology data base
    mco_links:
        a data frame with the links to classify the term based on identifiers
        of the ontology data base with at least the following columns:
        TERM_ID:
            identifiers for case entities of GC-terms available on the
            ontology data base
        TERM_TYPE:
            a string with the term type according to the ontology data base

Output
----------
A data frame with the GC-terms and the additional info from the ontology
data base for those mapped terms.


Example
----------
import pandas as pd
import format_fun
import mapping_fun

ex_dic = {"GSE": ["GSEnnn", "GSEnnn", "GSEnnn"],
    "GSM": ["GSMnnn", "GSMnnn", "GSMnnn"],
    "GPL_PMID": ["GPLnnn-PMID:nnn","GPLnnn-PMID:nnn","GPLnnn-PMID:nnn"],
    "BANGLINE": ["growth_protocol_ch1.1", "growth_protocol_ch1.1", "growth_protocol_ch1.1"],
    "FULLTEXT": ["Loreum loreum <tag> loreum 37 </tag>","Loreum loreum <tag> loreum 37 </tag>","Loreum loreum <tag> loreum 37 </tag>"],
    "TERM_NAME": ["loreum 37","loreum 37","loreum 37"],
    "TERM_TYPE": ["exTag Type", "exTag Type", "exTag Type"]
}

npl_df = pd.DataFrame(data=ex_dic)

ex_dic = {"TERM_ID": ["MCOnnnn", "MCOnnnn", "MCOnnnn"],
         "TERM_CLASS_ID": ["XXXnnnn", "XXXnnnn", "XXXnnnn"],
         "TERM_CLASS_PARENT_ID": ["XXXnnnn","XXXnnnn","XXXnnnn"],
         "TERM_NAME": ["loreum 37","loreum 37","loreum 37"],
         "TERM_DESCRIPTION": ["Loreum loreum","Loreum loreum","Loreum loreum"],
         "TERM_HEAD": ["Loreum loreum", "Loreum loreum", "Loreum loreum"],
         }
mco_df = pd.DataFrame(data=ex_dic)

ex_dic = {"TERM_ID": ["MCOnnnn", "MCOnnnn", "MCOnnnn"],
         "GC_ID": ["MCOnnnn", "MCOnnnn", "MCOnnnn"],
         "TERM_TYPE": ["exTag Type", "exTag Type", "exTag Type"],
         "TERM_ORDER": ["Loreum", "Loreum", "Loreum"],
         }
mco_links =  pd.DataFrame(data=ex_dic)

mapping_fun.raw_map_mco(npl_df, mco_df, mco_links = None)
mapping_fun.raw_map_mco(npl_df, mco_df, mco_links = mco_links)


"""
def raw_map_mco(npl_df, mco_df, mco_links = None, unmap = False):
    """Map NPL terms to ontology entries by exact match.

    Params
    ----------
        npl_df:
            data frame with the NPL terms; joined on TERM_NAME (and on
            TERM_TYPE when mco_links is given).
        mco_df:
            data frame with the ontology entries; joined on TERM_NAME.
        mco_links:
            optional data frame joined to mco_df on TERM_ID (inner join)
            before the mapping; when given, the mapping also requires
            TERM_TYPE to be equal.
        unmap:
            when true, every row of npl_df is kept (left join) and the
            ontology columns of unmatched rows are missing values;
            otherwise only the matched rows are returned (inner join).

    Output
    ----------
    The merged data frame.
    """
    use_links = mco_links is not None
    if use_links:
        # Attach the term type of each ontology entry through its TERM_ID.
        mco_df = mco_df.merge(mco_links, how="inner", on=["TERM_ID"])

    join_keys = ["TERM_NAME", "TERM_TYPE"] if use_links else ["TERM_NAME"]
    # "left" keeps terms without an id (filled with NA); "inner" drops them.
    join_kind = "left" if unmap else "inner"

    return npl_df.merge(mco_df, how=join_kind, on=join_keys)

"""
Description
----------

This wrapper function for token functions of fuzzyWuzzy calculates string
similarities between an input string, a text fragment of a growth condition
(term_name), and a set of candidate cases by computing two similarity scores
with the token_sort_ratio() and the token_set_ratio() functions. Finally,
it returns the matching case with the largest sort score.


Params
----------
    term_name:
        a string with a text fragment corresponding to the annotated
        GC-term in the MCO from RegulonDB.
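    cases:
        a data frame with the cases to compare, with at least the columns
        TERM_NAME (case string) and TERM_ID (case identifier).

Output
----------
A list
    1: input term_name
    2: matching case
    3: set score (score from token_set_ratio)
    4: sort score (score from token_sort_ratio)
    5: TERM_ID of the matching case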

Notes
----------
Token functions

Tokenize the strings, change capitals to lowercase, and remove punctuation.
It first sorts strings alphabetically and then joins them together. Finally,
the fuzz.ratio() is calculated.

token_sort_ratio()
    Compare strings with the same spelling but in a different order.
token_set_ratio()
    Compare strings with a significant difference in length;
    it removes the common tokens before calculating the fuzz.ratio()

"""
def token_ratio_wrap(term_name, cases):
    """Score term_name against every candidate case and return the best one.

    Params
    ----------
        term_name:
            string to compare.
        cases:
            data frame with at least the columns TERM_NAME (candidate
            string) and TERM_ID (candidate identifier).

    Output
    ----------
    A list [term_name, case_term_name, set_ratio, sort_ratio, id] for the
    candidate with the largest sort_ratio, where set_ratio and sort_ratio
    come from fuzz.token_set_ratio and fuzz.token_sort_ratio.
    When cases has no rows the placeholder [term_name, "", 0, 0, ""] is
    returned instead of raising IndexError.
    """
    candidates = cases[["TERM_NAME", "TERM_ID"]]

    # Nothing to compare against: return an explicit "no match" record
    # (same shape as a real hit) rather than failing on iloc[0] below.
    if candidates.empty:
        return [term_name, "", 0, 0, ""]

    # itertuples yields plain (TERM_NAME, TERM_ID) tuples, avoiding the
    # per-row Series that iterrows builds.
    scored = [
        [
            case_name,
            fuzz.token_set_ratio(term_name, case_name),
            fuzz.token_sort_ratio(term_name, case_name),
            case_id,
        ]
        for case_name, case_id in candidates.itertuples(index=False, name=None)
    ]
    odf = DF(scored, columns=["case_term_name", "set_ratio", "sort_ratio", "id"])

    # The best hit is the one with the highest token_sort_ratio score.
    ranked = odf.sort_values(by=["sort_ratio"], ascending=False)

    return [term_name] + list(ranked.iloc[0, :])

"""
Description
----------
This function calculates string similarity between a text fragment of a
growth condition (term_name) and the comparison cases. It first selects
the proper string cases (mco_df) and then performs a string matching
with the token_ratio_wrap function.

Params
----------
    term_name:
        a string with a text fragment corresponding to the annotated
        GC-term in the MCO from RegulonDB.
    term_type:
        str with the term type abbreviation (CRF tags)
    mco_df: ontology data base
        name:
            term name
        synonyms:
            all related synonyms
        _id:
            associated term name RegulonDB id
        ontologies_id:
            ontologies id, a column with empty fields
        oboId:
            id from Open Biological and Biomedical Ontologies, a column
            with empty fields.
    ncol_cases:
        a integer, the number for the column with the cases to match
        in the mco_df (ontology data base)

Output
----------
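A list
    1: input term_name
    2: matching case ("" when there is no candidate case)
    3: set score (score from token_set_ratio)
    4: sort score (score from token_sort_ratio)
    5: TERM_ID of the matching case ("" when there is no candidate case)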

Notes
----------
"""
def sym_score(term_name, ttype, mco_df, ncol_cases):
    """Find the best string match for term_name among the ontology cases.

    Params
    ----------
        term_name:
            string to match.
        ttype:
            term type used to filter mco_df (rows whose TERM_TYPE equals
            ttype are kept); the value "full" disables the filter.
        mco_df:
            data frame with the candidate cases; needs a TERM_TYPE column
            unless ttype is "full".
        ncol_cases:
            not used by this function; kept so existing callers keep
            working.

    Output
    ----------
    The list returned by token_ratio_wrap for the remaining candidates, or
    the placeholder [term_name, "", 0, 0, ""] when no candidate is left.
    """
    # Restrict the candidates to the requested term type.
    if ttype != "full":
        mco_df = mco_df[mco_df.TERM_TYPE == ttype]

    # No candidates (after filtering, or an empty data base in "full"
    # mode): report an empty match instead of scoring nothing.
    if mco_df.empty:
        return [term_name, "", 0, 0, ""]

    return token_ratio_wrap(term_name, mco_df)

"""
Description
----------
This function maps the growth conditions automatically extracted with
the NPL annotation framework to the ontology entities provided by using
string similarities. Then, it filters the best hits based on a
minimal similarity score.

Params
----------
    npl_df:
        a data frame with the automatic tagging from the npl workflow with
        at least the following columns:
        TERM_NAME:
            a string with a text fragment corresponding to the automatic
            annotated GC-term by the NPL-workflow.
        TERM_TYPE:
            a string with the term type abbreviation assigned by the
            NPL-workflow
    mco_df:
        a data frame with the ontology data base with at least the following
        columns:
        TERM_NAME:
            a string with a text fragment corresponding to case entities
            annotated of GC-terms in the ontology data base
        TERM_ID:
            identifiers for case entities of GC-terms available on the
            ontology data base
    mco_links:
        a data frame with the links to classify the term based on identifiers
        of the ontology data base with at least the following columns:
        TERM_ID:
            identifiers for case entities of GC-terms available on the
            ontology data base
        TERM_TYPE:
            a string with the term type according to the ontology data base
    min_match:
        an integer with the minimal similarity score to filter best matching

Output
----------
A data frame with the GC-terms and the additional info from the ontology
data base for those mapped terms, plus additional columns with the
similarity scores.

Example
----------
import pandas as pd
import format_fun
import mapping_fun

ex_dic = {"GSE": ["GSEnnn", "GSEnnn", "GSEnnn"],
    "GSM": ["GSMnnn", "GSMnnn", "GSMnnn"],
    "GPL_PMID": ["GPLnnn-PMID:nnn","GPLnnn-PMID:nnn","GPLnnn-PMID:nnn"],
    "BANGLINE": ["growth_protocol_ch1.1", "growth_protocol_ch1.1", "growth_protocol_ch1.1"],
    "FULLTEXT": ["Loreum loreum <tag> loreum 37 </tag>","Loreum loreum <tag> loreum 37 </tag>","Loreum loreum <tag> loreum 37 </tag>"],
    "TERM_NAME": ["loreum 37","loreum 37","loreum 37"],
    "TERM_TYPE": ["exTag Type", "exTag Type", "exTag Type"]
}

npl_df = pd.DataFrame(data=ex_dic)

ex_dic = {"TERM_ID": ["MCOnnnn", "MCOnnnn", "MCOnnnn"],
         "TERM_CLASS_ID": ["XXXnnnn", "XXXnnnn", "XXXnnnn"],
         "TERM_CLASS_PARENT_ID": ["XXXnnnn","XXXnnnn","XXXnnnn"],
         "TERM_NAME": ["loreum 37","loreum 37","loreum 37"],
         "TERM_DESCRIPTION": ["Loreum loreum","Loreum loreum","Loreum loreum"],
         "TERM_HEAD": ["Loreum loreum", "Loreum loreum", "Loreum loreum"],
         }
mco_df = pd.DataFrame(data=ex_dic)

ex_dic = {"TERM_ID": ["MCOnnnn", "MCOnnnn", "MCOnnnn"],
         "GC_ID": ["MCOnnnn", "MCOnnnn", "MCOnnnn"],
         "TERM_TYPE": ["exTag Type", "exTag Type", "exTag Type"],
         "TERM_ORDER": ["Loreum", "Loreum", "Loreum"],
         }
mco_links =  pd.DataFrame(data=ex_dic)

mapping_fun.str_match_map_mco(npl_df, mco_df, mco_links = None,  min_match=None)
mapping_fun.str_match_map_mco(npl_df, mco_df, mco_links = None,  min_match=2)

Notes
----------
mco_links
"""
def str_match_map_mco(npl_df, mco_df, mco_links = None,  min_match=None, npl_merges = True):
    """Map NPL terms to ontology entries by string similarity.

    Every row of npl_df is scored with sym_score against the ontology cases
    and its best hit is reported.

    Params
    ----------
        npl_df:
            data frame with a TERM_NAME column, and a TERM_TYPE column when
            mco_links is given.
        mco_df:
            data frame with the ontology cases (TERM_NAME and TERM_ID).
        mco_links:
            optional data frame joined to mco_df on TERM_ID; when given,
            each term is only compared with the cases of its own TERM_TYPE,
            otherwise with all the cases.
        min_match:
            optional threshold; only rows whose SET score is strictly
            greater than min_match are returned.
        npl_merges:
            when true the scores are appended to the npl_df columns,
            otherwise only the score columns are returned.

    Output
    ----------
    A data frame with one row per npl_df row (before the min_match filter).
    With npl_merges it holds the npl_df columns followed by CASE_MATCH, SET,
    SORT and TERM_ID; without it, TERM_NAME, CASE_MATCH, SET, SORT and
    TERM_ID.

    Notes
    ----------
    The scores are attached to npl_df by row position. Joining them back on
    TERM_NAME multiplied the rows of repeated terms (n repetitions gave
    n * n rows) and, with mco_links, mixed the hits of equal names that
    have different term types.
    """
    npl_columns = list(npl_df.columns)
    ntname = npl_columns.index("TERM_NAME")

    use_links = mco_links is not None
    if use_links:
        nttype = npl_columns.index("TERM_TYPE")
        mco_df = DF.merge(mco_df, mco_links, on=["TERM_ID"])

    case_col = list(mco_df.columns).index("TERM_NAME")

    # Plain tuples are indexed by column position, which does not depend on
    # the label-vs-position rules of Series indexing.
    scores_list = [
        sym_score(
            term_name=row[ntname],
            ttype=row[nttype] if use_links else "full",
            mco_df=mco_df,
            ncol_cases=case_col,
        )
        for row in npl_df.itertuples(index=False, name=None)
    ]

    match_scores_df = DF(scores_list, columns=["TERM_NAME", "CASE_MATCH", "SET", "SORT", "TERM_ID"])

    if npl_merges:
        # scores_list has exactly one entry per npl_df row, in order, so
        # align both frames on a fresh positional index. Suffixes mirror
        # the merge defaults for columns present on both sides.
        npl_matches = npl_df.reset_index(drop=True).join(
            match_scores_df.drop(columns=["TERM_NAME"]),
            lsuffix="_x",
            rsuffix="_y",
        )
    else:
        npl_matches = match_scores_df

    if min_match is not None:
        npl_matches = npl_matches[npl_matches.SET > min_match]

    return npl_matches