# ri-autoregulation-extraction-v01.py 16.7 KB

# Raw Blame History Permalink

# -*- coding: UTF-8 -*-
from optparse import OptionParser
import sys
import os
import json
import operator
import re
from general_functions import getTypeRegulation
from nltk.corpus import words

__author__ = 'CMendezC'


# Objective: obtain predicted ris from autoregulation sentences,
# such as ArgP protein represses its own synthesis
# Input format: transformed format.
# WARNING: Only one sentence per line

# Parameters:
#   1) --inputPath Input path
#   2) --inputFile Input file
#   3) --outputPath Output path
#   5) --diccPath Dictionary path
#   7) --diccEffect File with normalized effects

#   6) --diccFile JSON file with entity dictionaries
#   9) --diccEPAth Dictionary path diccEffect
#   8) --format Output format: standoff, tabs

# Output:
#   1) File with predicted ris combined with existing files.
# Format standoff:
# T1	TF 0 0	ArgP
# T2	GENE 0 0	Argp -- > argP
# R1	Interaction.activator Target:T3 Agent:T1
# Sentence ArgP protein represses its own synthesis
# The FimZ transcription factor activates this promoter directly ,
#   and it also positively regulates the transcription of its own gene
# FimZ is known to regulate the expression of its own gene positively
# FimZ also positively regulates its own transcription
# ArgP protein represses its own synthesis
# ArgP both represses its own transcription
# ArgP protein represses its own synthesis
# OxyR|OxyR|TF is|be|VBZ also|also|RB a|a|DT regulator|regulator|EFFECT
#   of|of|IN its|its|PRP$ own|own|JJ expression|expression|NN

# Execution
# python3 ri-autoregulation-extraction-v01.py
# --inputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/autoregulation-sentences
# --inputFile dataSet_OnlyRI_sentences.auto.1017.txt
# --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs
# --diccPath /home/cmendezc/terminologicalResources
# --diccEffect normalized_Effects.json
# python3 ri-autoregulation-extraction-v01.py --inputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/autoregulation-sentences --inputFile dataSet_OnlyRI_sentences.auto.1017.txt --outputPath /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/predicted-ris-gcs --diccPath /home/cmendezc/terminologicalResources --diccEffect normalized_Effects.json

###########################################################
#                       MAIN PROGRAM                      #
###########################################################

def getPosWord(wordPos, endPos, text, termList):
    """Find the first token of ``text`` that equals one of the terms in ``termList``.

    ``text`` is tokenized with ``str.split()``. Tokens whose index is lower than
    ``wordPos`` are skipped, and the scan stops once the token at index
    ``endPos`` has been examined. Both limits are converted with ``int()``.

    Returns ``[token, start, end]`` for the first matching token, where
    ``start`` is the running sum of ``len(token) + 1`` over the preceding
    tokens and ``end`` is ``start + len(token) - 1``; returns ``None`` when no
    token matches inside the window.
    """
    charOffset = 0
    for index, token in enumerate(text.split()):
        insideWindow = index >= int(wordPos)
        if insideWindow and any(token == term for term in termList):
            return [token, charOffset, charOffset + len(token) - 1]
        # Advance past this token plus one separator character.
        charOffset += len(token) + 1
        if index + 1 > int(endPos):
            return None
    return None

def getIdEntity(aList, etype, idE):
    """Return the standoff id of an entity, registering it when it is new.

    ``aList`` holds ``[entity, startPos, endPos]``. When ``etype`` is
    ``"EFFECT"`` the type is extended to ``"EFFECT.<normalized>"``, where the
    normalized form is looked up in the module-level ``hashNormalizedEffects``
    and defaults to the entity text itself.

    The entity is identified by the key ``"<etype> <start> <end> <entity>"`` in
    the module-level ``hashEntities``. An unknown key increments ``idE`` and is
    stored under the id ``"T<idE>"``.

    Returns a tuple ``(idEntity, idE)`` with the possibly incremented counter.
    """
    entityText = aList[0]
    if etype == "EFFECT":
        # Use the normalized effect when there is one, the raw word otherwise.
        if entityText in hashNormalizedEffects:
            effectName = hashNormalizedEffects[entityText]
        else:
            effectName = entityText
        etype = etype + "." + effectName
    startPos, endPos = aList[1], aList[2]
    entityKey = "{} {} {} {}".format(etype, startPos, endPos, entityText)
    try:
        return hashEntities[entityKey], idE
    except KeyError:
        idE += 1
        hashEntities[entityKey] = "T{}".format(idE)
        return hashEntities[entityKey], idE

def getIdInteraction(regulator, regulated, effect, idI, hashInt):
    """Return the standoff id of an interaction, registering it when it is new.

    The interaction is identified by the key
    ``"<regulator> <regulated> <effect>"`` in ``hashInt``. An unknown key
    increments ``idI`` and is stored in ``hashInt`` under the id ``"R<idI>"``.

    Returns a tuple ``(idInteraction, idI)`` with the possibly incremented
    counter.
    """
    interactionKey = "{} {} {}".format(regulator, regulated, effect)
    if interactionKey in hashInt:
        # Already registered: reuse its id and leave the counter untouched.
        return hashInt[interactionKey], idI
    idI += 1
    newId = "R{}".format(idI)
    hashInt[interactionKey] = newId
    return newId, idI

def saveFiles(filename, hashE, hashI, s, effect):
    """Write the entity (.a1), interaction (.a2) and sentence (.txt) files.

    The files are written below the module-level ``options.outputPath``, in the
    ``complete-ris`` subdirectory when ``effect`` is true and in
    ``incomplete-ris`` otherwise. The subdirectory is created when missing.
    All three files share the name of ``filename`` without its last extension
    and are overwritten if they already exist.

    Parameters:
        filename: name of the input file; only its stem is used.
        hashE: dict mapping ``"<type> <start> <end> <entity>"`` to an entity id.
        hashI: dict mapping ``"<regulator> <regulated> <effect>"`` to an
            interaction id.
        s: text written verbatim to the ``.txt`` file.
        effect: selects the output subdirectory (see above).
    """
    subDir = "complete-ris" if effect else "incomplete-ris"
    outputPath = os.path.join(options.outputPath, subDir)
    # The loaders tolerate a missing directory, so the writer must not crash on it.
    os.makedirs(outputPath, exist_ok=True)

    # rfind() returns -1 when there is no dot; slicing with -1 would silently
    # drop the last character of the name.
    dotPos = filename.rfind(".")
    baseName = filename[:dotPos] if dotPos != -1 else filename
    basePath = os.path.join(outputPath, baseName)

    with open(basePath + ".a1", mode="w") as a1File:
        for k, v in sorted(hashE.items(), key=operator.itemgetter(1)):
            # Split only on the first three separators so that an entity text
            # containing spaces is written back complete.
            etype, posStart, posEnd, entity = k.split(" ", 3)
            a1File.write("{}\t{} {} {}\t{}\n".format(v, etype, posStart, posEnd, entity))
    with open(basePath + ".a2", mode="w") as a2File:
        for k, v in sorted(hashI.items(), key=operator.itemgetter(1)):
            aList = k.split()
            a2File.write("{}\tInteraction.{} Target:{} Agent:{}\n".format(v, aList[2], aList[1], aList[0]))
    with open(basePath + ".txt", mode="w") as txtFile:
        txtFile.write(s)

def loadFileEntities(filename, outputPath, hashTemp):
    """Load previously saved entities from the ``.a1`` file matching ``filename``.

    The file ``<stem of filename>.a1`` is read from ``outputPath``. Each
    non-blank line is expected to look like ``T1<TAB>TF 0 3<TAB>ArgP``. For
    every line the key ``"<type> <start> <end> <entity>"`` is added to
    ``hashTemp`` with the entity id as value, unless the key is already there.

    Returns the highest numeric id among the entries that were added, and never
    less than 1. When the file cannot be opened a message is printed and 1 is
    returned.
    """
    idE = 1
    # rfind() returns -1 when there is no dot; slicing with -1 would silently
    # drop the last character of the name.
    dotPos = filename.rfind(".")
    baseName = filename[:dotPos] if dotPos != -1 else filename
    a1Path = os.path.join(outputPath, baseName + ".a1")
    try:
        with open(a1Path, mode="r") as a1File:
            for line in a1File:
                line = line.strip('\n')
                if not line.strip():
                    # Blank lines (e.g. a trailing empty line) carry no entity.
                    continue
                listLine1 = line.split('\t')
                listLine2 = listLine1[1].split(' ')
                etype = listLine2[0]
                entityPosStart = listLine2[1]
                entityPosEnd = listLine2[2]
                entity = listLine1[2]
                keyEntity = "{} {} {} {}".format(etype, entityPosStart, entityPosEnd, entity)
                idEntity = listLine1[0]
                if keyEntity not in hashTemp:
                    hashTemp[keyEntity] = idEntity
                    # Drop the leading "T" to compare the numeric part.
                    idE = max(idE, int(idEntity[1:]))
    except IOError:
        print("IOError file: {}".format(a1Path))
    return idE

def loadFileInteractions(filename, outputPath, hashTemp):
    """Load previously saved interactions from the ``.a2`` file matching ``filename``.

    The file ``<stem of filename>.a2`` is read from ``outputPath``. Each
    non-blank line is expected to look like
    ``R1<TAB>Interaction.<effect> Target:<id> Agent:<id>``. For every line the
    key ``"<agent id> <target id> <effect>"`` is added to ``hashTemp`` with the
    interaction id as value, unless the key is already there.

    Returns the highest numeric id among the entries that were added, and never
    less than 1. When the file cannot be opened a message is printed and 1 is
    returned.
    """
    idI = 1
    # rfind() returns -1 when there is no dot; slicing with -1 would silently
    # drop the last character of the name.
    dotPos = filename.rfind(".")
    baseName = filename[:dotPos] if dotPos != -1 else filename
    a2Path = os.path.join(outputPath, baseName + ".a2")
    try:
        with open(a2Path, mode="r") as a2File:
            for line in a2File:
                line = line.strip('\n')
                if not line.strip():
                    # Blank lines (e.g. a trailing empty line) carry no interaction.
                    continue
                listLine1 = line.split('\t')
                listLine2 = listLine1[1].split(' ')
                # "Agent:T1" -> "T1"
                regulator = listLine2[2]
                regulator = regulator[regulator.find(":") + 1:]
                # "Target:T2" -> "T2"
                regulated = listLine2[1]
                regulated = regulated[regulated.find(":") + 1:]
                # "Interaction.regulator" -> "regulator"
                effect = listLine2[0]
                effect = effect[effect.find(".") + 1:]
                idInteraction = listLine1[0]
                keyInteraction = "{} {} {}".format(regulator, regulated, effect)
                if keyInteraction not in hashTemp:
                    hashTemp[keyInteraction] = idInteraction
                    # Drop the leading "R" to compare the numeric part.
                    idI = max(idI, int(idInteraction[1:]))
    except IOError:
        print("IOError file: {}".format(a2Path))
    return idI

'''
def getTypeRegulation(effect_group, posini, sent, type_sent):
    # To change regulation effect in such as:
    # negative regulator --> repressor
    # positively regulates --> activator
    effect_ret = effect_group
    #listEff = effect_ret.split('|')

    if type_sent == "tra":
        regexTypeEffectPosi = re.compile(r'(?<=positive\|(RB|JJ) )' + effect_ret)
        regexTypeEffectNega = re.compile(r'(?<=negative\|(RB|JJ) )' + effect_ret)
        if regexTypeEffectPosi.search(sent, posini - 12):
            # Creo que no es necesario: effect_ret = "activator|{}|{}".format(listEff[1], listEff[2])
            effect_ret = "activator"
            print("Change regulation effect: {}".format(sent))
        elif regexTypeEffectNega.search(sent, posini - 12):
            # Creo que no es necesario: effect_ret = "repressor|{}|{}".format(listEff[1], listEff[2])
            effect_ret = "repressor"
            print("Change regulation effect: {}".format(sent))
    return effect_ret
'''

def getRealPos(posStart, posEnd, lin):
    """Return ``(posStart, posEnd)`` unchanged; ``lin`` is accepted but not used."""
    span = (posStart, posEnd)
    return span

def getRI(r, l):
    """Build the autoregulation record for regex match ``r`` found in line ``l``.

    ``r`` must provide the named groups ``regulator`` and ``effect``. The
    regulated gene is derived from the regulator: its word is the first
    ``|``-separated field of the regulator token with the first character
    lowercased, and its span is always ``(0, 0)``. The effect text is passed
    through ``getTypeRegulation`` before it is stored.

    Returns a four-element list:
    ``["<regulator token>|<start>|<end>", "<gene>|<gene>|GENE|0|0",
    "<effect>|<start>|<end>", l]``.
    """
    tfToken = r.group('regulator')
    tfSpan = getRealPos(r.start('regulator'), r.end('regulator'), l)

    # The gene name is the TF name with a lowercase initial (ArgP -> argP).
    tfWord = tfToken.split('|')[0]
    geneWord = tfWord[0].lower() + tfWord[1:]
    geneToken = "{0}|{0}|GENE".format(geneWord)
    geneSpan = getRealPos(0, 0, l)

    effectText = r.group('effect')
    effectSpan = getRealPos(r.start('effect'), r.end('effect'), l)
    # Lets the helper turn e.g. "negative regulator" into "repressor".
    effectText = getTypeRegulation(effectText, r.start('effect'), l, "tra")

    return [
        '|'.join([tfToken, str(tfSpan[0]), str(tfSpan[1])]),
        '|'.join([geneToken, str(geneSpan[0]), str(geneSpan[1])]),
        '|'.join([effectText, str(effectSpan[0]), str(effectSpan[1])]),
        l,
    ]

if __name__ == "__main__":
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Input path", metavar="PATH")
    parser.add_option("--inputFile", dest="inputFile",
                      help="Input file", metavar="FILE")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path", metavar="PATH")
    parser.add_option("--diccPath", dest="diccPath",
                      help="Path to read dictionaries", metavar="PATH")
    parser.add_option("--diccEffect", dest="diccEffect",
                      help="File with normalized effects", metavar="FILE")

    (options, args) = parser.parse_args()

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Input path: " + str(options.inputPath))
    print("Input file: " + str(options.inputFile))
    print("Output path: " + str(options.outputPath))
    print("Path to read dictionaries: " + str(options.diccPath))
    print("File with normalized effects: " + str(options.diccEffect))

    # Every option is used to build a file path below, so stop with a usage
    # message instead of a TypeError raised by os.path.join(None, ...).
    for optName in ("inputPath", "inputFile", "outputPath", "diccPath", "diccEffect"):
        if getattr(options, optName) is None:
            parser.error("Missing required parameter --{}".format(optName))

    # Loading normalized effects
    print('Loading normalized effects (all)...')
    with open(os.path.join(options.diccPath, options.diccEffect)) as diccFile:
        hashNormalizedEffects = json.load(diccFile)
    # The dictionary keys are matched as literal words, so escape them before
    # splicing them into the pattern.
    effects = "|".join(re.escape(eff) for eff in hashNormalizedEffects)

    # Sentences the pattern is meant to cover:
    # The FimZ transcription factor activates this promoter directly ,
    #   and it also positively regulates the transcription of its own gene
    # FimZ is known to regulate the expression of its own gene positively
    # FimZ also positively regulates its own transcription
    # ArgP protein represses its own synthesis
    # ArgP both represses its own transcription
    # OxyR|OxyR|TF is|be|VBZ also|also|RB a|a|DT regulator|regulator|EFFECT
    #   of|of|IN its|its|PRP$ own|own|JJ expression|expression|NN
    # Both halves are raw strings: "\|", "\s" and "\$" are not valid escapes in
    # a normal string literal.
    regexAutoRI = re.compile(
        r'(?P<regulator>[^|\s]+\|[^|]+\|TF).+\s(?P<effect>(' + effects
        + r'))\|[^|]+\|EFFECT\s([^\s]+\s){,5}its\|its\|PRP\$ own\|own\|JJ (gene|transcription|synthesis|expression)')

    # Entities and interactions already saved for this input file are loaded
    # first so that new ids continue from the existing ones.
    filename = options.inputFile
    hashEntities = {}
    hashInteractions = {}
    hashInteractionsEffect = {}
    outputPath = os.path.join(options.outputPath, "complete-ris")
    idEntities = loadFileEntities(filename, outputPath, hashEntities)
    idInteractionsEffect = loadFileInteractions(filename, outputPath, hashInteractionsEffect)
    outputPath = os.path.join(options.outputPath, "incomplete-ris")
    idInteractions = loadFileInteractions(filename, outputPath, hashInteractions)

    listRIs = []
    with open(os.path.join(options.inputPath, options.inputFile)) as iFile:
        for line in iFile:
            line = line.rstrip('\n')
            print("Buscando autoregulation")
            result = regexAutoRI.search(line)
            if not result:
                continue
            # Look once more for a match that starts after the first regulator,
            # inside the text already matched. Searching with pos/endpos
            # instead of a slice keeps the offsets relative to the whole line,
            # which is what getRI() stores and passes on with the line.
            result2 = regexAutoRI.search(line, result.end('regulator'), result.end(0))
            if result2:
                result = result2
            print("Regulator {} regulated {} effect {}".format(result.group('regulator'), result.group('regulator'), result.group('effect')))
            listRIs.append(getRI(result, line))
            print("listRIs: {}".format(listRIs))

    for ri in listRIs:
        if len(ri) != 4:
            print("WARNING! corrupted list")
            sys.exit(1)
        regulator, regulated, effect, line = ri

        # regulator and regulated look like "word|lemma|TYPE|start|end"
        listElem = regulator.split('|')
        regulatorWord = listElem[0]
        regulatorStart = listElem[3]
        regulatorEnd = listElem[4]

        listElem = regulated.split('|')
        regulatedWord = listElem[0]
        regulatedStart = listElem[3]
        regulatedEnd = listElem[4]

        # effect looks like "word|start|end"
        listElem = effect.split('|')
        effectWord = listElem[0]
        effectStart = listElem[1]
        effectEnd = listElem[2]

        idRegulator, idEntities = getIdEntity([regulatorWord, regulatorStart, regulatorEnd], "TF", idEntities)
        idRegulated, idEntities = getIdEntity([regulatedWord, regulatedStart, regulatedEnd], "GENE", idEntities)
        # Interaction without effect detail (written to incomplete-ris)
        idInteraction, idInteractions = getIdInteraction(idRegulator, idRegulated, "regulator",
                                                         idInteractions, hashInteractions)
        # Interaction that references the effect entity (written to complete-ris)
        idEffect, idEntities = getIdEntity([effectWord, effectStart, effectEnd], "EFFECT", idEntities)
        idInteraction, idInteractionsEffect = getIdInteraction(idRegulator, idRegulated, idEffect,
                                                               idInteractionsEffect,
                                                               hashInteractionsEffect)

    # The output files are overwritten on every save, so saving once after the
    # loop leaves the same files as saving after each interaction: all the
    # collected entities and interactions plus the sentence of the last one.
    if listRIs:
        lastLine = listRIs[-1][3]
        saveFiles(filename, hashEntities, hashInteractions, lastLine, effect=False)
        saveFiles(filename, hashEntities, hashInteractionsEffect, lastLine, effect=True)