# File: ri-openie-extraction-v02.py (21.2 KB)
#
# (Repository web-viewer text "Raw Blame History Permalink" was pasted here; it is not part of the script.)

# -*- coding: UTF-8 -*-
from optparse import OptionParser
import sys
import os
import json
import operator
from general_functions import getTypeRegulation
import re
from nltk.corpus import words

__author__ = 'CMendezC'


# Objective: obtain predicted ris from triplets extracted by OpenIE Stanford CoreNLP
# Input format:
# WARNING: Only one sentence per line

# Parameters:
#   1) --inputPath Input path
#   2) --inputFile Input file
#   3) --outputPath Output path
#   4) --diccPath Path of the entity dictionary file
#   5) --diccFile JSON file with entity dictionaries
#   6) --diccEffect JSON file with normalized effects
#   7) --format Output format: standoff, tabs
#   8) --diccEPAth Path of the normalized effects file (--diccEffect)

# Output:
#   1) File with predicted ris.
# Format standoff:
# T1	TF 0 0	MetR
# T2	TU 0 0	metH
# T3	GENE 0 0	metH
# T1      Growth_condition 88 137 mitochondrial electron transport chain inhibitors
# T2      Growth_condition 150 179        switch rich to minimal medium
# R1	Interaction.activator Target:T3 Agent:T1
# R2	Interaction.activator Target:T2 Agent:T1

# Execution
# python3.4 ri-openie-extraction.py
# --inputFile /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/predicted-ris/predicted-ris.reverb
# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/processing-ris
# --diccPath /home/cmendezc/terminologicalResources
# --diccFile normalized_Effects_Type.json
# --diccEffect termFilesTag_RIE_GCE_SYSTEM_ECCO.jsong
# --format standoff

###########################################################
#                       MAIN PROGRAM                      #
###########################################################

def getPosWord(wordPos, endPos, text, termList, type_entity=""):
    """Locate the first dictionary term among words wordPos..endPos of text.

    ``text`` is split on whitespace and its words are numbered from 0.
    Words before index ``wordPos`` are only counted; from that index on,
    each word is compared (exact equality) with every entry of ``termList``.
    The scan stops after the word at index ``endPos`` has been handled.

    Returns ``[word, start, end]`` for the first match, or None when there
    is none.  ``start`` is the character offset of the word computed as if
    consecutive words were separated by exactly one character, and ``end``
    is the offset of the word's last character (inclusive).  When
    ``type_entity`` is "EFFECT", the text is printed and the returned word is
    the result of ``getTypeRegulation(word, int(wordPos), text, "word")``.
    """
    charOffset = 0
    for index, token in enumerate(text.split()):
        # Positions may arrive as strings, hence the int() conversions.
        if index >= int(wordPos) and any(token == term for term in termList):
            lastChar = charOffset + len(token) - 1
            if type_entity != "EFFECT":
                return [token, charOffset, lastChar]
            # Effects are rewritten by the helper, e.g.
            # negative regulator --> repressor, positively regulates --> activator
            print("text: {}".format(text))
            normalized = getTypeRegulation(token, int(wordPos), text, "word")
            return [normalized, charOffset, lastChar]
        # Advance past this word and the single separator assumed after it.
        charOffset += len(token) + 1
        if index + 1 > int(endPos):
            return None
    return None


def getIdEntity(aList, etype, idE):
    """Return the standoff id ("T<n>") of an entity, registering it if new.

    ``aList`` is ``[entity, startOffset, endOffset]``.  For ``etype`` equal to
    "EFFECT" the type becomes "EFFECT.<normalized>", where the normalized
    effect is looked up in the module-level ``hashEffects`` (the entity
    itself is used when it has no entry).  Entities are keyed by
    "<type> <start> <end> <entity>" in the module-level ``hashEntities``.

    Returns ``(idEntity, idE)``.  For a new entity ``idE`` is incremented
    first and the new value is used as the id number; for a known entity the
    stored id is returned and ``idE`` is passed through unchanged.
    """
    entity = aList[0]
    if etype == "EFFECT":
        normalized = hashEffects[entity] if entity in hashEffects else entity
        etype = etype + "." + normalized
    keyEntity = "{} {} {} {}".format(etype, aList[1], aList[2], entity)

    # Known entity: reuse its id, counter untouched.
    if keyEntity in hashEntities:
        return hashEntities[keyEntity], idE

    # New entity: take the next number and remember the assignment.
    idE += 1
    newId = "T{}".format(idE)
    hashEntities[keyEntity] = newId
    return newId, idE


def getIdInteraction(regulator, regulated, effect, idI, hashInt):
    """Return the standoff id ("R<n>") of an interaction, registering it if new.

    Interactions are keyed in ``hashInt`` by "<regulator> <regulated> <effect>".
    Returns ``(idInteraction, idI)``: for a new interaction ``idI`` is
    incremented first and the new value becomes the id number; for a known
    interaction the stored id is returned and ``idI`` is unchanged.
    """
    keyInteraction = "{} {} {}".format(regulator, regulated, effect)

    # Already registered: hand back the stored id.
    if keyInteraction in hashInt:
        return hashInt[keyInteraction], idI

    idI += 1
    newId = "R{}".format(idI)
    hashInt[keyInteraction] = newId
    return newId, idI


def saveFiles(filename, hashE, hashI, s, effect):
    """Write the standoff files (.a1, .a2, .txt) of one sentence file.

    Parameters:
        filename: name of the sentence file; everything from its first "."
            on is dropped to build the output base name.
        hashE: entities, keyed "<type> <start> <end> <entity text>" with the
            entity id ("T<n>") as value.
        hashI: interactions, keyed "<regulator id> <regulated id> <effect>"
            with the interaction id ("R<n>") as value.
        s: sentence text written to the .txt file.
        effect: True writes under "<outputPath>/complete-ris", False under
            "<outputPath>/incomplete-ris" (outputPath is read from the
            module-level ``options``).

    Existing files are overwritten.  Lines are ordered by id (string order).
    """
    subdir = "complete-ris" if effect else "incomplete-ris"
    outputPath = os.path.join(options.outputPath, subdir)
    # The target directory may not exist on a first run.
    os.makedirs(outputPath, exist_ok=True)

    # Derive the base name from the filename argument itself.  The previous
    # code sliced it at the position of the first "." found in an unrelated
    # module-level variable (the path of the input line currently being read).
    baseName = filename.partition(".")[0] or filename
    basePath = os.path.join(outputPath, baseName)

    with open(basePath + ".a1", mode="w") as a1File:
        for k, v in sorted(hashE.items(), key=operator.itemgetter(1)):
            # Split only on the first three spaces so that entity text which
            # itself contains spaces is written back in full.
            aList = k.split(" ", 3)
            a1File.write("{}\t{} {} {}\t{}\n".format(v, aList[0], aList[1], aList[2], aList[3]))
    with open(basePath + ".a2", mode="w") as a2File:
        for k, v in sorted(hashI.items(), key=operator.itemgetter(1)):
            aList = k.split()
            a2File.write("{}\tInteraction.{} Target:{} Agent:{}\n".format(v, aList[2], aList[1], aList[0]))
    with open(basePath + ".txt", mode="w") as txtFile:
        txtFile.write(s)

def loadFileEntities(filename, outputPath, hashTemp):
    """Load previously saved entities from "<outputPath>/<base>.a1".

    ``<base>`` is ``filename`` up to (not including) its first ".".  Each
    line is expected as "<id>\\t<type> <start> <end>\\t<entity text>"; it is
    stored in ``hashTemp`` under the key "<type> <start> <end> <entity text>"
    with the id as value, unless that key is already present.

    Returns the highest numeric id among the entries added, but never less
    than 1.  If the file cannot be opened a message is printed and 1 is
    returned.
    """
    idE = 1
    # Build the path from the filename argument itself; the previous code
    # sliced it using an unrelated module-level variable.
    baseName = filename.partition(".")[0] or filename
    a1Path = os.path.join(outputPath, baseName + ".a1")
    try:
        with open(a1Path, mode="r") as a1File:
            for line in a1File:
                line = line.strip('\n')
                if not line:
                    # Blank lines carry no entity; skip instead of failing.
                    continue
                listLine1 = line.split('\t')
                listLine2 = listLine1[1].split(' ')
                etype = listLine2[0]
                entityPosStart = listLine2[1]
                entityPosEnd = listLine2[2]
                entity = listLine1[2]
                keyEntity = "{} {} {} {}".format(etype, entityPosStart, entityPosEnd, entity)
                idEntity = listLine1[0]
                if keyEntity not in hashTemp:
                    hashTemp[keyEntity] = idEntity
                    # Ids look like "T12": keep the largest number seen.
                    idE = max(idE, int(idEntity[1:]))
    except IOError:
        print("IOError file, idEntity starts in 1: {}".format(a1Path))
    return idE

def loadFileInteractions(filename, outputPath, hashTemp):
    """Load previously saved interactions from "<outputPath>/<base>.a2".

    ``<base>`` is ``filename`` up to (not including) its first ".".  Each
    line is expected as
    "<id>\\tInteraction.<effect> Target:<regulated> Agent:<regulator>"; it is
    stored in ``hashTemp`` under the key "<regulator> <regulated> <effect>"
    with the id as value, unless that key is already present.

    Returns the highest numeric id among the entries added, but never less
    than 1.  If the file cannot be opened a message is printed and 1 is
    returned.
    """
    idI = 1
    # Build the path from the filename argument itself; the previous code
    # sliced it using an unrelated module-level variable.
    baseName = filename.partition(".")[0] or filename
    a2Path = os.path.join(outputPath, baseName + ".a2")
    try:
        with open(a2Path, mode="r") as a2File:
            for line in a2File:
                line = line.strip('\n')
                if not line:
                    # Blank lines carry no interaction; skip instead of failing.
                    continue
                listLine1 = line.split('\t')
                listLine2 = listLine1[1].split(' ')
                # Drop the "Agent:" / "Target:" / "Interaction." prefixes.
                regulator = listLine2[2]
                regulator = regulator[regulator.find(":") + 1:]
                regulated = listLine2[1]
                regulated = regulated[regulated.find(":") + 1:]
                effect = listLine2[0]
                effect = effect[effect.find(".") + 1:]
                idInteraction = listLine1[0]
                keyInteraction = "{} {} {}".format(regulator, regulated, effect)
                if keyInteraction not in hashTemp:
                    hashTemp[keyInteraction] = idInteraction
                    # Ids look like "R12": keep the largest number seen.
                    idI = max(idI, int(idInteraction[1:]))
    except IOError:
        print("IOError file, idInteraction starts in 1: {}".format(a2Path))
    return idI

if __name__ == "__main__":
    # Command-line driver.  Reads OpenIE triplets (one tab-separated triplet
    # per line), looks for a TF in one argument and a GENE or TU in the other,
    # optionally an EFFECT word in the relation, and writes the predicted
    # interactions of every sentence file through saveFiles().
    #
    # NOTE: options, hashEffects, hashEntities and file are kept as
    # module-level names on purpose: functions in this file may read them as
    # globals.

    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath", default="",
                      help="Input path", metavar="PATH")
    parser.add_option("--inputFile", dest="inputFile",
                      help="Input file", metavar="FILE")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path", metavar="PATH")
    parser.add_option("--diccPath", dest="diccPath", default="",
                      help="Path to read dictionaries", metavar="PATH")
    parser.add_option("--diccFile", dest="diccFile",
                      help="JSON file with entity dictionaries", metavar="FILE")
    parser.add_option("--diccEffect", dest="diccEffect",
                      help="File with normalized effects", metavar="FILE")
    parser.add_option("--format", dest="format",
                      help="Output format: standoff", metavar="TEXT")
    parser.add_option("--diccEPAth", dest="diccEPAth", default="",
                      help="Path to read normalized effects", metavar="PATH")

    (options, args) = parser.parse_args()
    if args:
        # parser.error() prints the usage and terminates the program itself.
        parser.error("Unexpected positional arguments: {}".format(" ".join(args)))
    # Path options default to "" so that file options may carry a full path;
    # the file options and the output path have no sensible default.
    missingOptions = ["--" + name
                      for name in ("inputFile", "outputPath", "diccFile", "diccEffect")
                      if getattr(options, name) is None]
    if missingOptions:
        parser.error("Missing required option(s): {}".format(", ".join(missingOptions)))

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Input path: " + str(options.inputPath))
    print("Input file: " + str(options.inputFile))
    print("Output path: " + str(options.outputPath))
    print("Path to read dictionaries: " + str(options.diccPath))
    print("JSON file with entity dictionaries: " + str(options.diccFile))
    print("Path to read normalized effects: " + str(options.diccEPAth))
    print("File with normalized effects: " + str(options.diccEffect))
    print("Output format: " + str(options.format))

    # The English word list that used to be loaded here (regularWords) was
    # never consulted, so it is no longer loaded.

    # hashDicc: file number (as string) -> {term: entity type}
    print('Loading dictionaries...')
    with open(os.path.join(options.diccPath, options.diccFile)) as diccFile:
        hashDicc = json.load(diccFile)

    # Loading normalized effects
    print('Loading normalized effects...')
    with open(os.path.join(options.diccEPAth, options.diccEffect)) as diccFile:
        hashEffects = json.load(diccFile)

    completePath = os.path.join(options.outputPath, "complete-ris")
    incompletePath = os.path.join(options.outputPath, "incomplete-ris")
    entityTypes = ("TF", "TU", "EFFECT", "GENE")

    files = {}
    hashEntities = {}
    hashInteractions = {}
    hashInteractionsEffect = {}
    idEntities = 1
    idInteractions = 1
    idInteractionsEffect = 1
    filenameBefore = ''
    regexNumFile = re.compile(r'_([0-9]+)[.-]')
    numFile = ""
    hashTerms = {etype: [] for etype in entityTypes}

    with open(os.path.join(options.inputPath, options.inputFile)) as iFile:
        for line in iFile:
            line = line.rstrip('\n')
            if not line.strip():
                # Blank lines (e.g. at the end of the file) carry no triplet.
                continue
            listLine = line.split('\t')
            if len(listLine) < 13:
                # Columns up to index 12 (the sentence) are read below.
                print("WARNING: line with {} columns skipped (13 expected)".format(len(listLine)))
                continue

            file = listLine[0]
            filename = file.split("/")[-1]
            filename = filename[:-4]
            if filename not in files:
                # New file, that is, new sentence
                files[filename] = 1
                # Flush what was collected for the previous sentence file.
                if len(files) > 1 and hashEntities:
                    saveFiles(filenameBefore, hashEntities, hashInteractions, sent, effect=False)
                    saveFiles(filenameBefore, hashEntities, hashInteractionsEffect, sent, effect=True)
                filenameBefore = filename

                # Start from whatever was already saved for this file.
                hashEntities = {}
                hashInteractions = {}
                hashInteractionsEffect = {}
                idEntities = loadFileEntities(filename, completePath, hashEntities)
                idInteractionsEffect = loadFileInteractions(filename, completePath, hashInteractionsEffect)
                idInteractions = loadFileInteractions(filename, incompletePath, hashInteractions)

                # The number in the filename selects the term dictionary.
                # Reset it first so a filename without a number does not
                # silently reuse the dictionary of the previous file.
                numFile = ""
                result = regexNumFile.search(filenameBefore)
                if result:
                    # int() drops leading zeros: "061" -> "61".
                    numFile = str(int(result.group(1)))
                    print("Numfile: {}".format(numFile))
                else:
                    print("WARNING: numfile not found in filename")

                hashTerms = {etype: [] for etype in entityTypes}
                if numFile in hashDicc:
                    for term, etype in hashDicc[numFile].items():
                        if etype in hashTerms:
                            hashTerms[etype].append(term)
                        else:
                            print("WARNING: entity not found in dictionaries")
                else:
                    print("WARNING: numfile not found in dictionaries")

            # Word spans of the triplet arguments (A, C) and relation (B).
            startA, endA = listLine[5], listLine[6]
            startB, endB = listLine[7], listLine[8]
            startC, endC = listLine[9], listLine[10]
            sent = listLine[12]

            # The regulator (TF) is searched in argument A first, then in
            # argument C; the regulated entity is searched in the other one.
            listRegulator = getPosWord(startA, endA, sent, hashTerms["TF"])
            regulatedStart, regulatedEnd = startC, endC
            if listRegulator is None:
                listRegulator = getPosWord(startC, endC, sent, hashTerms["TF"])
                regulatedStart, regulatedEnd = startA, endA
            if listRegulator is None:
                continue

            # A GENE is preferred as regulated entity; a TU is the fallback.
            regulatedType = "GENE"
            listRegulated = getPosWord(regulatedStart, regulatedEnd, sent, hashTerms["GENE"])
            if listRegulated is None:
                regulatedType = "TU"
                listRegulated = getPosWord(regulatedStart, regulatedEnd, sent, hashTerms["TU"])
            if listRegulated is None:
                continue

            idRegulator, idEntities = getIdEntity(listRegulator, "TF", idEntities)
            idRegulated, idEntities = getIdEntity(listRegulated, regulatedType, idEntities)
            # Every TF/target pair is recorded as a generic "regulator"
            # interaction (incomplete ris) ...
            idInteraction, idInteractions = getIdInteraction(
                idRegulator, idRegulated, "regulator", idInteractions, hashInteractions)
            # ... and additionally with its effect when the relation contains
            # an EFFECT term (complete ris).
            listEffect = getPosWord(startB, endB, sent, hashTerms["EFFECT"], "EFFECT")
            if listEffect is not None:
                idEffect, idEntities = getIdEntity(listEffect, "EFFECT", idEntities)
                idInteraction, idInteractionsEffect = getIdInteraction(
                    idRegulator, idRegulated, idEffect, idInteractionsEffect, hashInteractionsEffect)

    # Save the last sentence file.  This must also happen when the input
    # contains a single file, so no "more than one file" condition here.
    if hashEntities:
        saveFiles(filenameBefore, hashEntities, hashInteractions, sent, effect=False)
        saveFiles(filenameBefore, hashEntities, hashInteractionsEffect, sent, effect=True)