# get-TRN-Organism-v1.py 12.4 KB
# -*- coding: UTF-8 -*-
import operator
from optparse import OptionParser
import os
import sys
import json
import re
import pandas as pd

__author__ = 'CMendezC'


# Objective: add organism annotation (http://pakal.ccg.unam.mx/cmendezc/bacteria-annotation) to the TRN table

# Parameters:
#   1) --trnPath Path to TRN detail table
#   2) --trnFile File of TRN detail table
#   3) --outputPath Output path
#   4) --organismPath Path to Organism annotation table
#   5) --organismFile File of Organism annotation table

# Output:
#   1) Tsv file detail with:
# TF	TypeRegulated	Regulated	Effect	PMID    IdSentence  TypeSentence    Sentence
#   Original_idsentence  Original_sentence  SectionNum SectionName  OrganismMentions    OrganismScore    ConfirmationLevel
# OrganismScore = {
#   If only salmonella or only non identified organism  = 1,
#   If (startswith salmonella or non identified organism) and other organisms = 0.5
#   If only other organisms = 0
#   }

# Execution:
# python3.4 get-TRN-Organism-v1.py

# Local
# python get-TRN-Organism-v1.py
# --trnPath "/home/cmendezc/Dropbox (UNAM-CCG)/PGC_CO/Proyectos/02.Proyectos_vigentes/NLP/salmonella_bioinfo_nlp_cm/3.Desarrollo/text-mining/results"
# --trnFile STMTRN_all.detail.tsv
# --outputPath "/home/cmendezc/Dropbox (UNAM-CCG)/PGC_CO/Proyectos/02.Proyectos_vigentes/NLP/salmonella_bioinfo_nlp_cm/3.Desarrollo/text-mining/results"
# --organismPath /home/cmendezc/Documents/ccg/gitlab-bacteria-annotation/results
# --organismFile annotations_STMTRN_all.sentences.csv
# python3 get-TRN-Organism-v1.py --trnPath "/home/cmendezc/Dropbox (UNAM-CCG)/PGC_CO/Proyectos/02.Proyectos_vigentes/NLP/salmonella_bioinfo_nlp_cm/3.Desarrollo/text-mining/results" --trnFile STMTRN_all.detail.tsv --outputPath "/home/cmendezc/Dropbox (UNAM-CCG)/PGC_CO/Proyectos/02.Proyectos_vigentes/NLP/salmonella_bioinfo_nlp_cm/3.Desarrollo/text-mining/results" --organismPath /home/cmendezc/Documents/ccg/gitlab-bacteria-annotation/results --organismFile annotations_STMTRN_all.sentences.csv

###########################################################
#                       MAIN PROGRAM                      #
###########################################################

def only_salmonella_or_non_identified_organism(list_temp):
    """Tell whether every organism mention is Salmonella or a placeholder name.

    A mention is accepted when its lower-cased form starts with
    "salmonella", or when it is exactly (case-sensitively) one of the
    non-identified organism names listed below.

    :param list_temp: sized collection of organism name strings.
    :return: True when all mentions are accepted (this includes an empty
        collection), False otherwise.
    """
    placeholder_names = frozenset((
        'unidentified plasmid',
        'unidentified',
        'bacterium',
        'bacterium IFAM-3211',
        'bacterium IFAM-2074',
        'bacterium IFAM-1493',
        'bacterium IFAM-3215',
        'bacterium IFAM-3359',
        'hybrid',
        'Vector pMC1403',
        'Transposon Tn10',
        'unidentified cloning vector',
        'Plasmid F',
        'Cloning vector pUC19',
    ))

    def is_accepted(name):
        # The Salmonella test ignores case; the placeholder test does not.
        return name.lower().startswith("salmonella") or name in placeholder_names

    accepted = sum(1 for name in list_temp if is_accepted(name))
    return accepted == len(list_temp)

def salmonella_or_non_identified_and_other_organisms(list_temp):
    """Tell whether the mentions mix Salmonella/placeholder names with others.

    A mention counts as "accepted" when its lower-cased form starts with
    "salmonella", or when it is exactly (case-sensitively) one of the
    non-identified organism names listed below.

    :param list_temp: sized collection of organism name strings.
    :return: True when at least one mention is accepted and at least one
        is not; False otherwise (including for an empty collection).
    """
    placeholder_names = frozenset((
        'unidentified plasmid',
        'unidentified',
        'bacterium',
        'bacterium IFAM-3211',
        'bacterium IFAM-2074',
        'bacterium IFAM-1493',
        'bacterium IFAM-3215',
        'bacterium IFAM-3359',
        'hybrid',
        'Vector pMC1403',
        'Transposon Tn10',
        'unidentified cloning vector',
        'Plasmid F',
        'Cloning vector pUC19',
    ))
    accepted = 0
    for name in list_temp:
        is_salmonella = name.lower().startswith("salmonella")
        if is_salmonella or name in placeholder_names:
            accepted += 1
    # Strictly between "none accepted" and "all accepted".
    return 0 < accepted < len(list_temp)

def only_other_organims(list_temp):
    """Tell whether no mention is Salmonella or a placeholder name.

    A mention counts as "accepted" when its lower-cased form starts with
    "salmonella", or when it is exactly (case-sensitively) one of the
    non-identified organism names listed below.

    :param list_temp: iterable of organism name strings.
    :return: True when no mention is accepted (this includes an empty
        iterable), False as soon as at least one is.
    """
    placeholder_names = frozenset((
        'unidentified plasmid',
        'unidentified',
        'bacterium',
        'bacterium IFAM-3211',
        'bacterium IFAM-2074',
        'bacterium IFAM-1493',
        'bacterium IFAM-3215',
        'bacterium IFAM-3359',
        'hybrid',
        'Vector pMC1403',
        'Transposon Tn10',
        'unidentified cloning vector',
        'Plasmid F',
        'Cloning vector pUC19',
    ))
    # Build the full list first so every mention is inspected, exactly as a
    # plain counting loop would do.
    verdicts = [
        name.lower().startswith("salmonella") or name in placeholder_names
        for name in list_temp
    ]
    return not any(verdicts)

def main():
    """Add organism annotations to the TRN detail table and write the results.

    Steps:
      1. Parse and validate the command-line options.
      2. Load the organism annotation table (CSV) and the TRN detail table (TSV).
      3. For every annotated sentence, store the organism mentions in the
         detail table and compute its OrganismScore (1.0, 0.5 or 0.0).
      4. Group the sentences whose score is not 0 by regulatory interaction
         (TF, TypeRegulated, Regulated, Effect).
      5. Write a "summary_org" file (one line per interaction) and a
         "detail_org" file (one line per sentence) into the output path.

    Exits through ``parser.error`` (status 2) when the options are invalid.
    """
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--trnPath", dest="trnPath",
                      help="Path to TRN detail table", metavar="PATH")
    parser.add_option("--trnFile", dest="trnFile",
                      help="File of TRN detail table", metavar="FILE")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path", metavar="PATH")
    parser.add_option("--organismPath", dest="organismPath",
                      help="Path to organism annotation table", metavar="PATH")
    parser.add_option("--organismFile", dest="organismFile",
                      help="File of organism annotation table", metavar="FILE")

    (options, args) = parser.parse_args()
    if args:
        # parser.error() prints the usage message and exits by itself.
        parser.error("Unexpected positional arguments: {}".format(" ".join(args)))
    required = ("trnPath", "trnFile", "outputPath", "organismPath", "organismFile")
    missing = ["--" + name for name in required if getattr(options, name) is None]
    if missing:
        # Without this check the script dies later with an obscure TypeError
        # inside os.path.join(None, ...).
        parser.error("Missing required options: {}".format(", ".join(missing)))
    if "detail" not in options.trnFile:
        # Output names are derived by replacing "detail" in the input name;
        # if it is absent both outputs would share the input's file name.
        parser.error("--trnFile must contain 'detail' so that output file names can be derived")

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to TRN detail table: " + str(options.trnPath))
    print("File of TRN detail table: " + str(options.trnFile))
    print("Output path: " + str(options.outputPath))
    print("Path to organism annotation table: " + str(options.organismPath))
    print("File of organism annotation table: " + str(options.organismFile))

    # Load organism annotation table
    print("Loading organism annotation table")
    df_organisms = pd.read_csv(os.path.join(options.organismPath, options.organismFile), sep=',')
    print("Total de frases anotadas con organism: {}".format(df_organisms.shape[0]))

    # Load TRN detail table
    print("Loading TRN detail table")
    df_detail = pd.read_csv(os.path.join(options.trnPath, options.trnFile), sep='\t')
    print("Total de frases en TRN: {}".format(df_detail.shape[0]))

    # Fix column for organism. We changed this issue in get-TRN-v2.py
    df_detail = df_detail.rename(columns={"Organism": "Organisms"})
    # An empty column is parsed as float (all NaN); make it object so that
    # organism strings can be stored in it.
    df_detail['Organisms'] = df_detail['Organisms'].astype(object)
    df_detail['OrganismScore'] = 1.00
    print(df_detail.columns)

    for idx in df_organisms.index:
        organisms = df_organisms.at[idx, 'Organisms']
        # NOTE(review): the -2 offset presumably converts a 1-based line
        # number of a file with a header line into a 0-based row label
        # of the detail table -- confirm against the annotation tool.
        row_label = df_organisms.at[idx, 'SentenceNumberInFile'] - 2
        if not isinstance(organisms, str):
            print("Warning: annotation row {} has no organism text, skipped".format(idx),
                  file=sys.stderr)
            continue
        if row_label not in df_detail.index:
            print("Warning: annotation row {} points to sentence {} which is not in the TRN table, skipped".format(
                idx, row_label), file=sys.stderr)
            continue
        # Use .at instead of chained indexing (df.col[label] = value), which
        # may write to a temporary copy and leave df_detail unchanged.
        df_detail.at[row_label, 'Organisms'] = organisms

        # OrganismScore = {
        #   If only salmonella or only non identified organism  = 1,
        #   If (startswith salmonella or non identified organism) and other organisms = 0.5
        #   If only other organisms = 0
        #   }
        list_organisms = organisms.split(';')
        if only_salmonella_or_non_identified_organism(list_organisms):
            df_detail.at[row_label, 'OrganismScore'] = 1.00
        elif salmonella_or_non_identified_and_other_organisms(list_organisms):
            df_detail.at[row_label, 'OrganismScore'] = 0.50
        elif only_other_organims(list_organisms):
            df_detail.at[row_label, 'OrganismScore'] = 0.00

    # Sentence-level columns copied into the detail output, in output order.
    sentence_columns = ('PMID', 'NumSentence', 'TypeSentence', 'Sentence',
                        'OriginalIdSentence', 'OriginalSentence', 'SectionNum',
                        'SectionName', 'Organisms', 'OrganismScore')
    # Trailing output columns (KT, CL, Source, Speculation, Negation,
    # ConfirmationLevel) are written empty.
    empty_trailing_fields = [""] * 6
    sentence_types = ("ver/dev", "dev", "att", "auto")

    hashPredictedRIs = {}           # interaction key -> list of detail lines
    hashPredictedRIsTypeCount = {}  # interaction key -> {sentence type: count}
    hashPredictedRIsScore = {}      # interaction key -> score (penalization currently disabled)
    for idx in df_detail.index:
        if df_detail.at[idx, 'OrganismScore'] == 0:
            # Sentences that only mention other organisms are discarded.
            continue
        llave = "{}\t{}\t{}\t{}".format(df_detail.at[idx, 'TF'],
                                        df_detail.at[idx, 'TypeRegulated'],
                                        df_detail.at[idx, 'Regulated'],
                                        df_detail.at[idx, 'Effect'])
        fields = [df_detail.at[idx, column] for column in sentence_columns]
        detail_line = "\t".join("{}".format(value) for value in fields + empty_trailing_fields)
        if llave not in hashPredictedRIs:
            hashPredictedRIs[llave] = []
            hashPredictedRIsTypeCount[llave] = dict.fromkeys(sentence_types, 0)
            hashPredictedRIsScore[llave] = 1
        hashPredictedRIs[llave].append(detail_line)
        type_sent = df_detail.at[idx, 'TypeSentence']
        if type_sent in hashPredictedRIsTypeCount[llave]:
            hashPredictedRIsTypeCount[llave][type_sent] += 1

    print("Total RIs en TRN con organismo: {}".format(len(hashPredictedRIs)))
    summary_file = os.path.join(options.outputPath, options.trnFile.replace("detail", "summary_org"))
    with open(summary_file, mode="w", encoding="utf-8") as oFile:
        oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tSentCount\tVer/Dev\tAtt\tAuto\tScore\tRI\n")
        for k, v in hashPredictedRIs.items():
            RI_value = "True"
            type_count = hashPredictedRIsTypeCount[k]
            # SentCount is the number of sentences kept for the interaction;
            # the "dev" count is tracked but not part of this summary.
            oFile.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(k, len(v), type_count["ver/dev"],
                                                              type_count["att"], type_count["auto"],
                                                              hashPredictedRIsScore[k], RI_value))
    detail_file = os.path.join(options.outputPath, options.trnFile.replace("detail", "detail_org"))
    i = 0
    with open(detail_file, mode="w", encoding="utf-8") as oFile:
        oFile.write("TF\tTypeRegulated\tRegulated\tEffect\tPMID\tNumSentence\tTypeSentence\tSentence\tOriginalIdSentence\tOriginalSentence\tSectionNum\tSectionName\tOrganisms\tOrganismScore\tKT\tCL\tSource\tSpeculation\tNegation\tConfirmationLevel\n")
        for k, v in hashPredictedRIs.items():
            for s in v:
                oFile.write("{}\t{}\n".format(k, s))
                i += 1
    print("Total de frases en TRN organismo: {}".format(i))


if __name__ == "__main__":
    main()