# sentence-filter_v02.py
# -*- coding: UTF-8 -*-

from optparse import OptionParser
import os
import sys
from time import time
import json
import re
import pandas as pd

__author__ = 'CMendezC'


# Objective: Filter sentences with specific entities.
# Also extract attributive sentences: effect-TF
# And autoregulation: regulates its own gene
# CFMC 2022-03-08: We added updating tsv file with idsentence, sentence and section (.pre.tsv)
#   to indicate filtered sentences.

# Parameters:
#   1) --inputFileWord Path and filename to read feature word file.
#   2) --inputFileTrans Path and filename to read transformed file.
#   3) --outputPath Path to place output file.
#   4) --outputFile Output file.
#   5) --filter FILT1: (GENE OR TU) AND TF
#               FILT2: (GENE OR TU) AND EFFECT AND TF
#   6) --attrPath Path for attributive cases: ArgP-regulated genes
#   7) --dicPath Path for dictionary
#   8) --dicFile Path for dictionary file normalized_Effects.json
#   9) --autoPath Path for autoregulation cases: regulates its own gene
#   10) --tsvPath Path to tsv file with section, id sentence, sentence (extracted from jsonpdf)
# /home/cmendezc/bitbucket_repositories/automatic-extraction-ris-gcs/rie-gce-system/automatic-extraction-STM-RIs-dataset/bries-bacterial-regulatory-interaction-extraction-system/autoregulation-sentences

# Output:
#   1) Filtered sentences.
#   2) Attributive sentences
#   3) Autoregulation sentences


###########################################################
#                       MAIN PROGRAM                      #
###########################################################

def getEntities(tline, filt):
    """Collect entity words tagged GENE, TU, TF or EFFECT from a transformed line.

    :param tline: space-separated tokens in "word|lemma|tag" form.
    :param filt: filter name; entities are collected for "FILT1" and "FILT2"
        (both use the same tag set). Any other value yields an empty dict.
    :return: dict mapping each entity word to the tag of its first
        occurrence in the line (later occurrences are ignored).
    """
    # FILT1: (GENE OR TU) AND TF
    # FILT2: (GENE OR TU) AND EFFECT AND TF
    entities = {}
    if filt not in ("FILT1", "FILT2"):
        return entities
    tline = tline.rstrip('\n\r ')
    for token in tline.split(" "):
        listElem = token.split("|")
        # Skip malformed tokens that do not carry all word|lemma|tag fields;
        # the original code raised IndexError on them.
        if len(listElem) < 3:
            continue
        w = listElem[0]
        t = listElem[2]
        if t in ("GENE", "TU", "TF", "EFFECT") and w not in entities:
            entities[w] = t
    return entities

if __name__ == "__main__":
    # Command-line interface definition (optparse kept for compatibility
    # with existing pipeline invocations).
    parser = OptionParser()

    parser.add_option("--inputFileWord", dest="inputFileWord",
                      help="Path and filename to read feature word file", metavar="PATH")
    parser.add_option("--inputFileTrans", dest="inputFileTrans",
                      help="Path and filename to read transformed file", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Output path", metavar="PATH")
    parser.add_option("--outputFile", dest="outputFile",
                      help="Output file", metavar="FILE")
    parser.add_option("--filter", dest="filter", choices=('FILT1', 'FILT2'), default=None,
                      help="FILT1: (GENE OR TU) AND TF; FILT2: (GENE OR TU) AND EFFECT AND TF", metavar="TEXT")
    parser.add_option("--attrPath", dest="attrPath",
                      help="Output path attributive sentences", metavar="PATH")
    parser.add_option("--dicPath", dest="dicPath",
                      help="Output path dictionary", metavar="PATH")
    parser.add_option("--dicFile", dest="dicFile",
                      help="Output file dictionary normalized_Effects.json", metavar="FILE")
    parser.add_option("--autoPath", dest="autoPath",
                      help="Output path autoregulation sentences", metavar="PATH")
    parser.add_option("--tsvPath", dest="tsvPath",
                      help="Path to tsv file with section, id sentence, sentence. Extracted from jsonpdf.", metavar="PATH")

    (options, args) = parser.parse_args()
    if len(args) > 0:
        # parser.error() prints the message to stderr and exits with status 2,
        # so the unreachable sys.exit(1) that followed it has been removed.
        parser.error("Unexpected positional arguments: this script only accepts named options.")

    # Echo the effective parameter values so pipeline runs are traceable in logs.
    parameterReport = [
        ("Path and filename to read feature word file", options.inputFileWord),
        ("Path and filename to read transformed file", options.inputFileTrans),
        ("Output path", options.outputPath),
        ("Output file", options.outputFile),
        ("Filter", options.filter),
        ("Output path attributive sentences", options.attrPath),
        ("Output path autoregulation sentences", options.autoPath),
        ("Output path dictionary", options.dicPath),
        ("Output file dictionary normalized_Effects.json", options.dicFile),
        ("Path to tsv file with section, id sentence, sentence (Extracted from jsonpdf)", options.tsvPath),
    ]
    print('-------------------------------- PARAMETERS --------------------------------')
    for label, value in parameterReport:
        print("{}: {}".format(label, str(value)))

    # Build the alternation of normalized effect words used later by the
    # attributive-sentence regex: every effect ending in "d" (past-participle
    # forms such as "activated") plus the literal "dependent".
    with open(os.path.join(options.dicPath, options.dicFile)) as diccFile:
        hashNormalizedEffects = json.load(diccFile)
    listEffects = [eff for eff in hashNormalizedEffects if eff.endswith('d')]
    listEffects.append("dependent")
    effects = "|".join(listEffects)
    print("Effects: {}".format(effects))

    t0 = time()
    count = 0
    hashEntities = {}   # sentence id -> {entity word: tag}
    hashAttrSent = {}   # sentence id -> attributive sentence (effect-TF)
    hashAutoSent = {}   # sentence id -> autoregulation sentence
    # Original CMC 2018-11-07: reAttrSent = re.compile(r'(' + effects + r')\|[^|]+\|TF [^|]+\|gene')
    # We decided to extract all sentences containing effect-TF because we observed some patterns where
    # "gene" does not appear, then, to recover these examples we employ a more general rule to separate
    # attributive sentences.
    reAttrSent = re.compile(r'(' + effects + r')\|[^|]+\|TF')
    # We decided to extract all sentences containing autoregulation
    # The FimZ transcription factor activates this promoter directly ,
    #   and it also positively regulates the transcription of its own gene
    # FimZ is known to regulate the expression of its own gene positively
    # FimZ also positively regulates its own transcription
    # ArgP protein represses its own synthesis
    # ArgP both represses its own transcription
    # ArgP protein represses its own synthesis
    # OxyR|OxyR|TF is|be|VBZ also|also|RB a|a|DT regulator|regulator|EFFECT
    #   of|of|IN its|its|PRP$ own|own|JJ expression|expression|NN
    reAutoSent = re.compile(r'(?<=\|TF).+\|EFFECT.+its\|its\|PRP\$ own\|own\|JJ')
    # Tag-detection patterns are loop-invariant: compile once here instead of
    # on every sentence (they were previously rebuilt inside the loop).
    reGENETU = re.compile(r'(\|GENE|\|TU)')
    reEFFECT = re.compile(r'\|EFFECT')
    reTF = re.compile(r'\|TF')
    aFilter = options.filter
    print("   Processing file...{}".format(options.inputFileTrans))
    with open(os.path.join(options.outputPath, options.outputFile), "w", encoding="utf-8", errors="replace") as oFile:
        with open(os.path.join(options.inputFileTrans), mode="r", encoding="utf-8", errors="replace") as tFile, open(os.path.join(options.inputFileWord), mode="r", encoding="utf-8", errors="replace") as wFile:
            # CFMC 2022-03-09: Load tsv file with section, id sentence, sentence (Extracted from jsonpdf)
            file = options.inputFileTrans[options.inputFileTrans.rfind("/")+1:]
            file_tsv = file.replace(".tra.txt", ".pre.tsv")
            tsv_file = pd.read_table(os.path.join(options.tsvPath, file_tsv))
            print("tsv_file.shape: {}".format(tsv_file.shape))
            tsv_file_filtered = tsv_file[tsv_file['status'] == 1]
            print("tsv_file_filtered.shape: {}".format(tsv_file_filtered.shape))
            tsv_file_new = tsv_file_filtered.reset_index(drop=True)
            i = 0
            for tLine, wLine in zip(tFile, wFile):
                # FILT1: (GENE OR TU) AND TF
                # FILT2: (GENE OR TU) AND EFFECT AND TF
                if aFilter == "FILT1":
                    keep = bool(reGENETU.search(tLine) and reTF.search(tLine))
                elif aFilter == "FILT2":
                    keep = bool(reGENETU.search(tLine) and reEFFECT.search(tLine) and reTF.search(tLine))
                else:
                    keep = None  # no filter requested: pass the line through untouched
                if keep is False:
                    # CFMC 2022-03-08: flag the discarded sentence in the tsv.
                    # BUGFIX: the original FILT2 branch executed `continue`
                    # BEFORE these statements, so discarded FILT2 sentences
                    # were never flagged and `i` drifted out of sync with the
                    # tsv rows. Both filters now share this path.
                    tsv_file_new.at[i, 'status'] = 0
                    i += 1
                    continue
                if keep:
                    tCount = str(count)
                    oFile.write(wLine)
                    if tCount not in hashEntities:
                        hashEntities[tCount] = getEntities(tLine, aFilter)
                    # Attributive sentences: effect-TF
                    if reAttrSent.search(tLine) and tCount not in hashAttrSent:
                        hashAttrSent[tCount] = tLine
                    # Autoregulation sentences: TF ... EFFECT ... "its own"
                    if reAutoSent.search(tLine) and tCount not in hashAutoSent:
                        hashAutoSent[tCount] = tLine
                count += 1
                i += 1

    # Propagate the per-sentence filter decisions from tsv_file_new back onto
    # the full tsv (left merge on idsentence; rows absent from tsv_file_new
    # keep their original status) and persist the updated table.
    merged = tsv_file.merge(tsv_file_new, on=['idsentence'], how='left')
    tsv_file.status = merged.status_y.where(~merged.status_y.isnull(), tsv_file.status).astype(int)
    tsv_file_filtered = tsv_file[tsv_file['status'] == 1]
    print("Last tsv_file_filtered.shape: {}".format(tsv_file_filtered.shape))
    tsv_file.to_csv(os.path.join(options.tsvPath, file_tsv.replace('.tsv', '.fil.tsv')), sep='\t')

    # Dump the collected entities as JSON next to the filtered output file.
    entsFileName = options.outputFile.replace(".txt", ".ents.json")
    with open(os.path.join(options.outputPath, entsFileName), "w", encoding="utf-8",
              errors="replace") as eFile:
        json.dump(hashEntities, eFile)

    # One file per captured sentence, named after the input transformed file.
    baseFileName = options.inputFileTrans.split('/')[-1]

    for sentId, sentence in hashAttrSent.items():
        attrFileName = baseFileName.replace('.tra.', '.att.' + sentId + '.')
        print("Save file {}".format(attrFileName))
        with open(os.path.join(options.attrPath, attrFileName), "w", encoding="utf-8", errors="replace") as aFile:
            aFile.write(sentence)

    for sentId, sentence in hashAutoSent.items():
        autoFileName = baseFileName.replace('.tra.', '.auto.' + sentId + '.')
        print("Save file {}".format(autoFileName))
        with open(os.path.join(options.autoPath, autoFileName), "w", encoding="utf-8", errors="replace") as aFile:
            aFile.write(sentence)

    print("Files split in: %fs" % (time() - t0))