ri-attributive-extraction-v02.py 20.9 KB

Raw Blame History Permalink

# -*- coding: UTF-8 -*-
from optparse import OptionParser
import sys
import os
import json
import operator
import re
from nltk.corpus import words

__author__ = 'CMendezC'


# Objective: obtain predicted ris from attributive sentences, such as ArgP-regulated gene argP
# Input format: transformed format.
# WARNING: Only one sentence per line

# Parameters:
#   1) --inputPath Input path
#   2) --inputFile Input file
#   3) --outputPath Output path
#   4) --diccPath Dictionary path
#   5) --diccEffect File with normalized effects
#
# Not currently accepted by the option parser:
#   --diccFile JSON file with entity dictionaries
#   --diccEPAth Dictionary path diccEffect
#   --format Output format: standoff, tabs

# Output:
#   1) Files with the predicted RIs, combined with the existing files.
# Format standoff:
# T1	TF 0 0	ArgP-regulated
# T2	GENE 0 0	argP
# T1      Growth_condition 88 137 mitochondrial electron transport chain inhibitors
# R1	Interaction.activator Target:T3 Agent:T1

# Execution
# C:\anaconda3\python ri-attributive-extraction.py
# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\bitbucket_automatic-extraction-ris-gcs\rie-gce-system\automatic-extraction-ris-gcs\attributive-sentences
# --inputFile ris-sentences-analysis.att.017.txt
# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\bitbucket_automatic-extraction-ris-gcs\rie-gce-system\automatic-extraction-ris-gcs\predicted-ris-gcs
# --diccPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources
# --diccEffect normalized_Effects.json
# C:\anaconda3\python ri-attributive-extraction.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\bitbucket_automatic-extraction-ris-gcs\rie-gce-system\automatic-extraction-ris-gcs\attributive-sentences --inputFile ris-sentences-analysis.att.017.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\bitbucket_automatic-extraction-ris-gcs\rie-gce-system\automatic-extraction-ris-gcs\predicted-ris-gcs --diccPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --diccEffect normalized_Effects.json
# C:\anaconda3\python ri-attributive-extraction.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\bitbucket_automatic-extraction-ris-gcs\rie-gce-system\automatic-extraction-ris-gcs\attributive-sentences --inputFile ris-sentences-analysis.att.286.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\bitbucket_automatic-extraction-ris-gcs\rie-gce-system\automatic-extraction-ris-gcs\predicted-ris-gcs --diccPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --diccEffect normalized_Effects.json

# python3 ri-attributive-extraction.py
# --inputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-ris-gcs/attributive-sentences
# --inputFile ris-sentences-analysis.att.017.txt
# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-ris-gcs/predicted-ris-gcs
# --diccPath /home/cmendezc/terminologicalResources
# --diccEffect normalized_Effects.json
# python3 ri-attributive-extraction.py --inputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-ris-gcs/attributive-sentences --inputFile ris-sentences-analysis.att.017.txt --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-ris-gcs/predicted-ris-gcs --diccPath /home/cmendezc/terminologicalResources --diccEffect normalized_Effects.json

###########################################################
#                       MAIN PROGRAM                      #
###########################################################

def getPosWord(wordPos, endPos, text, termList):
    """Find the first listed term inside a window of whitespace-split words.

    Words of ``text`` are numbered from 0. Only words whose index is at least
    ``wordPos`` are compared (by equality) against the entries of
    ``termList``; the scan stops once the index has moved past ``endPos``.

    Returns ``[word, start, end]`` for the first hit, where ``start`` is the
    character offset computed by assuming exactly one separator character
    between consecutive words and ``end`` is the inclusive offset of the last
    character. Returns ``None`` when nothing matches inside the window.
    """
    charOffset = 0
    for index, token in enumerate(text.split()):
        # Short-circuit keeps termList untouched until the window opens.
        if index >= int(wordPos) and any(token == term for term in termList):
            return [token, charOffset, charOffset + len(token) - 1]
        charOffset += len(token) + 1
        # index + 1 is the number of words consumed so far.
        if index + 1 > int(endPos):
            return None
    return None

def getIdEntity(aList, etype, idE):
    """Return the standoff ID of an entity, registering it if it is new.

    ``aList`` is ``[text, start, end]``. For ``etype == "EFFECT"`` the type is
    extended to ``EFFECT.<normalized>``, where the normalized form is looked
    up in the module-level ``hashNormalizedEffects`` (the raw text is used
    when there is no entry). Entities are keyed in the module-level
    ``hashEntities`` by ``"<type> <start> <end> <text>"``.

    Returns ``(idEntity, idE)``: for a new entity ``idE`` is incremented and
    the ID is ``"T<idE>"``; for a known entity the stored ID is returned and
    ``idE`` is passed back unchanged.
    """
    entity = aList[0]
    if etype == "EFFECT":
        suffix = hashNormalizedEffects[entity] if entity in hashNormalizedEffects else entity
        etype += "." + suffix
    keyEntity = "{} {} {} {}".format(etype, aList[1], aList[2], entity)

    # Known entity: reuse its ID, counter untouched.
    if keyEntity in hashEntities:
        return hashEntities[keyEntity], idE

    # New entity: take the next number and remember the key.
    idE += 1
    newId = "T{}".format(idE)
    hashEntities[keyEntity] = newId
    return newId, idE

def getIdInteraction(regulator, regulated, effect, idI, hashInt):
    """Return the standoff ID of an interaction, registering it if it is new.

    Interactions are keyed in ``hashInt`` by
    ``"<regulator> <regulated> <effect>"``. A new interaction increments
    ``idI`` and receives the ID ``"R<idI>"``; a known one reuses its stored
    ID. Returns ``(idInteraction, idI)``.
    """
    lookupKey = "{} {} {}".format(regulator, regulated, effect)
    if lookupKey in hashInt:
        return hashInt[lookupKey], idI

    idI += 1
    freshId = "R{}".format(idI)
    hashInt[lookupKey] = freshId
    return freshId, idI

def saveFiles(filename, hashE, hashI, s, effect):
    """Write the .a1, .a2 and .txt standoff files for one input file.

    The files go to ``<options.outputPath>/complete-ris`` when ``effect`` is
    true and to ``<options.outputPath>/incomplete-ris`` otherwise (``options``
    is the module-level parsed command line). Their base name is ``filename``
    cut at its last ".". Existing files are overwritten.

    Parameters:
        filename: input file name used to derive the output base name.
        hashE: entity map, key ``"<type> <start> <end> <text>"`` -> ID.
        hashI: interaction map, key ``"<regulator> <regulated> <effect>"`` -> ID.
        s: text written verbatim to the .txt file.
        effect: selects the output sub-directory (see above).

    Line formats:
        .a1  ``<id>\\t<type> <start> <end>\\t<text>``
        .a2  ``<id>\\tInteraction.<effect> Target:<regulated> Agent:<regulator>``
    Both files are ordered by the ID string.
    """
    if effect:
        outputPath = os.path.join(options.outputPath, "complete-ris")
    else:
        outputPath = os.path.join(options.outputPath, "incomplete-ris")
    baseName = filename[:filename.rfind(".")]

    with open(os.path.join(outputPath, baseName + ".a1"), mode="w") as a1File:
        for k, v in sorted(hashE.items(), key=operator.itemgetter(1)):
            # Split off only the three leading fields: the entity text is the
            # remainder of the key and may itself contain spaces (entities
            # loaded from an existing .a1 file can be multi-word). A plain
            # split() would keep just the first word of the text.
            etype, posStart, posEnd, text = k.split(" ", 3)
            a1File.write("{}\t{} {} {}\t{}\n".format(v, etype, posStart, posEnd, text))

    with open(os.path.join(outputPath, baseName + ".a2"), mode="w") as a2File:
        for k, v in sorted(hashI.items(), key=operator.itemgetter(1)):
            aList = k.split()
            a2File.write("{}\tInteraction.{} Target:{} Agent:{}\n".format(v, aList[2], aList[1], aList[0]))

    with open(os.path.join(outputPath, baseName + ".txt"), mode="w") as txtFile:
        txtFile.write(s)

def loadFileEntities(filename, outputPath, hashTemp):
    """Load the entities of an existing .a1 file into ``hashTemp``.

    The file read is ``<outputPath>/<filename cut at its last ".">.a1``. Each
    line is expected to look like ``<id>\\t<type> <start> <end>\\t<text>``;
    blank lines and lines with fewer fields are skipped. Every entity is
    stored as ``hashTemp["<type> <start> <end> <text>"] = <id>`` unless the
    key is already present.

    Returns the highest numeric ID among the entities stored by this call,
    or 0 when none was stored (missing, unreadable or empty file). Callers
    increment this value before allocating a new ID, so 0 makes the first
    new entity "T1".

    A missing or unreadable file is reported on stdout and otherwise ignored.
    """
    idE = 0
    a1Path = os.path.join(outputPath, filename[:filename.rfind(".")] + ".a1")
    try:
        with open(a1Path, mode="r") as a1File:
            for line in a1File:
                columns = line.strip('\n').split('\t')
                if len(columns) < 3:
                    # Blank or truncated line: nothing to load.
                    continue
                span = columns[1].split(' ')
                if len(span) < 3:
                    continue
                idEntity = columns[0]
                keyEntity = "{} {} {} {}".format(span[0], span[1], span[2], columns[2])
                if keyEntity not in hashTemp:
                    hashTemp[keyEntity] = idEntity
                    # IDs look like "T12"; keep the largest number seen.
                    idE = max(idE, int(idEntity[1:]))
    except IOError:
        print("IOError file: {}".format(a1Path))
    return idE

def loadFileInteractions(filename, outputPath, hashTemp):
    """Load the interactions of an existing .a2 file into ``hashTemp``.

    The file read is ``<outputPath>/<filename cut at its last ".">.a2``. Each
    line is expected to look like
    ``<id>\\tInteraction.<effect> Target:<regulated> Agent:<regulator>``;
    blank lines and lines with fewer fields are skipped. Every interaction is
    stored as ``hashTemp["<regulator> <regulated> <effect>"] = <id>`` unless
    the key is already present.

    Returns the highest numeric ID among the interactions stored by this
    call, or 0 when none was stored (missing, unreadable or empty file).
    Callers increment this value before allocating a new ID, so 0 makes the
    first new interaction "R1".

    A missing or unreadable file is reported on stdout and otherwise ignored.
    """
    idI = 0
    a2Path = os.path.join(outputPath, filename[:filename.rfind(".")] + ".a2")
    try:
        with open(a2Path, mode="r") as a2File:
            for line in a2File:
                columns = line.strip('\n').split('\t')
                if len(columns) < 2:
                    # Blank or truncated line: nothing to load.
                    continue
                parts = columns[1].split(' ')
                if len(parts) < 3:
                    continue
                # Keep what follows the first separator of each field; a
                # field without the separator is kept whole.
                effect = parts[0].split(".", 1)[-1]
                regulated = parts[1].split(":", 1)[-1]
                regulator = parts[2].split(":", 1)[-1]
                idInteraction = columns[0]
                keyInteraction = "{} {} {}".format(regulator, regulated, effect)
                if keyInteraction not in hashTemp:
                    hashTemp[keyInteraction] = idInteraction
                    # IDs look like "R7"; keep the largest number seen.
                    idI = max(idI, int(idInteraction[1:]))
    except IOError:
        print("IOError file: {}".format(a2Path))
    return idI

def getRealPos(posStart, posEnd, lin):
    """Return ``(posStart, posEnd)`` unchanged; ``lin`` is accepted but not used."""
    span = (posStart, posEnd)
    return span

def getRI(r, l):
    """Pack one regex match into the record used for a predicted interaction.

    ``r`` is a match object exposing the named groups ``regulator``,
    ``regulated`` and ``effect``; ``l`` is the line stored with the record.

    Returns a four-element list: one ``"<matched text>|<start>|<end>"`` string
    for the regulator, the regulated element and the effect (in that order),
    followed by ``l``. Start and end are the group's match offsets as passed
    through ``getRealPos``.
    """
    record = []
    for role in ('regulator', 'regulated', 'effect'):
        span = getRealPos(r.start(role), r.end(role), l)
        record.append('|'.join([r.group(role), str(span[0]), str(span[1])]))
    record.append(l)
    return record

if __name__ == "__main__":
    # Script entry point. Reads a file of tagged sentences (one per line),
    # finds attributive regulatory interactions such as "ArgP-regulated gene
    # argP" with two regular expressions, merges them with the entities and
    # interactions already stored under --outputPath, and rewrites the
    # standoff files through saveFiles().
    #
    # Typical invocation:
    # python3 $SCRIPT_PATH/ri-attributive-extraction-v02.py
    # --inputPath $(dirname ${file})
    # --inputFile $(basename ${file})
    # --outputPath $OUTPUT_PATH
    # --diccPath $DICC_PATH
    # --diccEffect normalized_Effects.json
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Input path", metavar="PATH")
    parser.add_option("--inputFile", dest="inputFile",
                      help="Input file", metavar="FILE")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path", metavar="PATH")
    parser.add_option("--diccPath", dest="diccPath",
                      help="Path to read dictionaries", metavar="PATH")
    parser.add_option("--diccEffect", dest="diccEffect",
                      help="File with normalized effects", metavar="FILE")

    (options, args) = parser.parse_args()

    # Every option is used to build a path below; stop with a usage message
    # rather than a TypeError raised from inside os.path.join.
    for requiredOption in ("inputPath", "inputFile", "outputPath", "diccPath", "diccEffect"):
        if getattr(options, requiredOption) is None:
            parser.error("Missing required parameter --{}".format(requiredOption))

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Input path: " + str(options.inputPath))
    print("Input file: " + str(options.inputFile))
    print("Output path: " + str(options.outputPath))
    print("Path to read dictionaries: " + str(options.diccPath))
    print("File with normalized effects: " + str(options.diccEffect))

    # Loading normalized effects. hashNormalizedEffects must stay a
    # module-level name: getIdEntity() reads it as a global.
    print('Loading normalized effects ending with -d...')
    with open(os.path.join(options.diccPath, options.diccEffect)) as diccFile:
        hashNormalizedEffects = json.load(diccFile)
    # Only the effect forms ending in "d", plus "dependent", are used as
    # suffixes of the regulator token.
    listEffects = [eff for eff in hashNormalizedEffects if eff.endswith('d')]
    listEffects.append("dependent")
    effects = "|".join(listEffects)

    # Tokens are three "|"-separated fields ending in a tag (GENE, TU, TF);
    # presumably word|lemma|tag - confirm against the input format.
    # The regulator is a TF-tagged token whose first field ends with one of
    # the effect forms (e.g. ArgP-regulated). Variants used before 2018-11-07
    # additionally required a following token whose second field is "gene".
    # Every literal is a raw string so that "\|" reaches the regex engine
    # without relying on an invalid string escape sequence.
    #
    # Left: regulated element first, regulator one or more tokens later.
    regexAttRILeft = re.compile(
        r'(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU)) ([^ ]+ )+(?P<regulator>[^|\s]+(?P<effect>(' + effects + r'))\|[^|]+\|TF)')
    # Right: regulator first, regulated element zero or more tokens later.
    regexAttRIRight = re.compile(
        r'(?P<regulator>[^|\s]+(?P<effect>(' + effects + r'))\|[^|]+\|TF) ([^ ]+ )*(?P<regulated>[^|\s]+\|[^|]+\|(GENE|TU))')

    # Working state. hashEntities must stay a module-level name: getIdEntity()
    # reads and updates it as a global.
    filename = options.inputFile
    hashEntities = {}
    hashInteractions = {}
    hashInteractionsEffect = {}
    # Entities and effect-typed interactions continue from "complete-ris";
    # plain interactions continue from "incomplete-ris".
    outputPath = os.path.join(options.outputPath, "complete-ris")
    idEntities = loadFileEntities(filename, outputPath, hashEntities)
    idInteractionsEffect = loadFileInteractions(filename, outputPath, hashInteractionsEffect)
    outputPath = os.path.join(options.outputPath, "incomplete-ris")
    idInteractions = loadFileInteractions(filename, outputPath, hashInteractions)

    listRIs = []

    with open(os.path.join(options.inputPath, options.inputFile)) as iFile:
        for line in iFile:
            line = line.rstrip('\n')
            # Search to the left first, then to the right. After each hit the
            # matched regulated token is removed from the working copy so the
            # next search can reach another regulated element.
            # NOTE(review): the offsets of every hit after the first refer to
            # the shortened working copy, while the record keeps the original
            # line - confirm whether downstream consumers rely on them.
            for regexAttRI in (regexAttRILeft, regexAttRIRight):
                lineTemp = line
                result = regexAttRI.search(lineTemp)
                while result:
                    listRIs.append(getRI(result, line))
                    lineTemp = lineTemp.replace(result.group('regulated'), '')
                    result = regexAttRI.search(lineTemp)

    # Each record is [regulator|start|end, regulated|start|end,
    # effect|start|end, line]; regulator and regulated still carry their own
    # three token fields, hence indexes 3 and 4 for the offsets.
    lastLine = None
    for ri in listRIs:
        if len(ri) != 4:
            print("WARNING! corrupted list")
            # sys.exit is always available (exit() is only added by the site
            # module) and a non-zero status marks the failure.
            sys.exit(1)
        regulator, regulated, effect, line = ri

        listElem = regulator.split('|')
        regulatorWord = listElem[0]
        regulatorStart = listElem[3]
        regulatorEnd = listElem[4]

        listElem = regulated.split('|')
        regulatedWord = listElem[0]
        regulatedType = listElem[2]
        regulatedStart = listElem[3]
        regulatedEnd = listElem[4]

        listElem = effect.split('|')
        effectWord = listElem[0]
        effectStart = listElem[1]
        effectEnd = listElem[2]

        if regulatedType not in ("GENE", "TU"):
            # Skip the record before anything is registered: there is no
            # regulated entity to attach the interaction to.
            print("WARNING! Unknown entity type")
            continue

        idRegulator, idEntities = getIdEntity([regulatorWord, regulatorStart, regulatorEnd], "TF", idEntities)
        idRegulated, idEntities = getIdEntity([regulatedWord, regulatedStart, regulatedEnd], regulatedType,
                                              idEntities)
        # Interaction without effect type (written under incomplete-ris).
        idInteraction, idInteractions = getIdInteraction(idRegulator, idRegulated, "regulator",
                                                         idInteractions, hashInteractions)
        # Interaction carrying the effect entity (written under complete-ris).
        idEffect, idEntities = getIdEntity([effectWord, effectStart, effectEnd], "EFFECT", idEntities)
        idInteraction, idInteractionsEffect = getIdInteraction(idRegulator, idRegulated, idEffect,
                                                               idInteractionsEffect,
                                                               hashInteractionsEffect)
        lastLine = line

    # saveFiles() overwrites its targets, so one write after the loop leaves
    # the same files as a write per record: final maps plus the line of the
    # last processed record. Nothing is written when no record was processed.
    if lastLine is not None:
        saveFiles(filename, hashEntities, hashInteractions, lastLine, effect=False)
        saveFiles(filename, hashEntities, hashInteractionsEffect, lastLine, effect=True)