"""
USE: python3 specie_annotation.py -d ../source/features/
export CLASSPATH=$CLASSPATH:/Users/joelrodriguez/Documents/lab/stanford-corenlp-4.2.2/*:
"""


import os
import argparse
import json
import sys
import re

import joblib
import numpy as np
import pandas as pd
import requests  # only needed if the commented-out OGER web-service call is re-enabled
from progress.bar import IncrementalBar
from oger.ctrl.router import Router, PipelineServer

##############################################################################################

    # COMMAND-LINE ARGUMENTS

parser = argparse.ArgumentParser(description='Takes plain-text files and returns sentences annotated with species (organism) mentions')

parser.add_argument(
  "-f", "--file",
  help="path to text file",
  required=False)

parser.add_argument(
  "-d", "--directory",
  help="path to directory of text files",
  required=False)

parser.add_argument(
  "-c", "--autosaved",
  help="resume from ../temp/autosaved.joblib if a previous run crashed",
  required=False,
  action='store_true')

args = parser.parse_args()
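
# The script writes scratch files to ../temp/ and results to ../results/;
# create them up front if missing (a small guard, not in the original script).
for d in ('../temp', '../results'):
    os.makedirs(d, exist_ok=True)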

# OGER pipeline configured with a custom organism term list.
conf = Router(termlist_path='../others/7Q9E7ZQFZ208AUCVOH23BPYBUS2PTWJ_custom.tsv')
pl = PipelineServer(conf)
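
# Quick smoke test of the OGER pipeline (sketch, commented out; the file name
# is hypothetical). It mirrors the load_one/process calls used in annotate().
#
#   with open('../temp/_smoke.txt', 'w') as f:
#       f.write('Escherichia coli was grown overnight.')
#   doc = pl.load_one('../temp/_smoke.txt', 'txt')
#   pl.process(doc)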

##############################################################################################

    # FUNCTIONS

def extract_json(file):
    """Flatten a CoreNLP JSON file into one 'word|lemma|POS' line per sentence."""
    with open(f"../temp/{file}", 'r') as json_file:
        data = json.load(json_file)
    # The input name is expected to end in '.txt.json'; strip those 9 characters.
    extracted_file = f'../results/{file[0:-9]}_NLP.txt'
    with open(extracted_file, 'w+') as new_file:
        sentences = []
        for sentence in data['sentences']:
            words = []
            for token in sentence['tokens']:
                corenlp = ''.join([token['word'], "|", token['lemma'], "|", token['pos']])
                words.append(corenlp)
            sentences.append(' '.join(words))
        new_file.write('\n'.join(sentences))
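
# Illustrative line of the word|lemma|POS output written by extract_json
# (hypothetical sentence, standard Penn Treebank tags):
#   The|the|DT mice|mouse|NNS were|be|VBD infected|infect|VBN .|.|.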

def del_refer(sentences_dict):
    """Truncate each article at its reference section: everything from the
    first sentence starting with 'Reference(s)' or 'REFERENCES' is dropped."""
    print('Removing article references...')
    for art_name, art_sentences in sentences_dict.items():
        temp = []
        print(f'Number of sentences in {art_name}: {len(art_sentences)}')
        for sentence in art_sentences:
            # re.match anchors at the start, so 'Reference' also covers 'References'.
            if re.match(r'(Reference|REFERENCES)', sentence) is None:
                temp.append(sentence)
            else:
                break
        sentences_dict[art_name] = temp
        print(f'Without references: {len(temp)}')

    return sentences_dict
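
# Illustrative behaviour of del_refer (hypothetical input):
#   del_refer({'a.txt': ['Intro.\n', 'References\n', 'Smith 2001\n']})
#   -> {'a.txt': ['Intro.\n']}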

def annotate(sentences_dict):
    """Run each sentence through the OGER pipeline and collect its organism
    annotations (None for sentences without an organism mention)."""
    print('Annotating sentences with OGER...')
    bar = IncrementalBar('Annotating', max=sum(len(s) for s in sentences_dict.values()))
    if args.autosaved and os.path.exists('../temp/autosaved.joblib'):
        organismAnnotations = joblib.load('../temp/autosaved.joblib')
    else:
        organismAnnotations = {}
    i = 0
    for art_name, art_sentences in sentences_dict.items():
        organismAnnotations[art_name] = []
        for sentence in art_sentences:
            annPerSen = []
            try:
                # OGER's pipeline works on files, so each sentence takes a
                # round trip through a temporary .txt and BioC JSON file.
                #annotations = requests.post(
                #    url = 'https://pub.cl.uzh.ch/projects/ontogene/oger/upload/txt/bioc_json', data = sentence.encode('utf-8'))
                senFile = f'../temp/{art_name}_{i}.txt'
                with open(senFile, 'w') as f:
                    f.write(sentence)
                doc = pl.load_one(senFile, 'txt')
                pl.process(doc)
                jsonFile = f'../temp/{art_name}_{i}.json'
                with open(jsonFile, 'w') as f:
                    pl.write(doc, 'bioc_json', f)
                with open(jsonFile, 'r') as f:
                    annotations = json.load(f)
                os.remove(senFile)
                os.remove(jsonFile)
            except Exception:
                # Autosave progress so a crashed run can resume with -c.
                joblib.dump(organismAnnotations, '../temp/autosaved.joblib')
                raise
            i += 1
            #annotations = annotations.json()['documents'][0]['passages'][0]['annotations']
            annotations = annotations['documents'][0]['passages'][0]['annotations']
            org_found = False
            for annotation in annotations:
                if annotation['infons']['type'] == 'organism':
                    org_found = True
                    annPerSen.append(annotation)
            if org_found:
                organismAnnotations[art_name].append(annPerSen)
            else:
                organismAnnotations[art_name].append(None)
            bar.next()
    bar.finish()

    return organismAnnotations
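
# Shape of the BioC JSON consumed in annotate() (abridged; only the fields
# accessed above are shown, values are illustrative):
#
#   {"documents": [{"passages": [{"annotations": [
#       {"text": "E. coli",
#        "infons": {"type": "organism",
#                   "preferred_form": "Escherichia coli"}}]}]}]}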

def coreNLP(texts, annDict):
    """Keep only the sentences that received an organism annotation. The
    actual CoreNLP tokenization step is currently disabled (commented out)."""
    filSentences = []
    for fileName, fileText, annotations in zip(texts.keys(), texts.values(), annDict.values()):
        filtered = []
        for sentence, annotation in zip(fileText, annotations):
            if annotation is not None:
                filtered.append(sentence)
#        with open(f'../temp/filtered_{fileName}','w') as filtered_file:
#            filtered_file.write(''.join(filtered))
#        os.system(f'java -mx3g edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators tokenize,ssplit,pos,lemma -outputFormat json -outputDirectory "../temp/" -file "../temp/filtered_{fileName}"')
#        extract_json(f'filtered_{fileName}.json')
        filSentences.append(filtered)
    # Flatten the per-file lists into a single list of sentences.
    filSentences = [item for sublist in filSentences for item in sublist]
    return filSentences
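
# If the CoreNLP step above is re-enabled, subprocess.run is a safer
# alternative to os.system (sketch; flags copied from the commented command,
# the CLASSPATH setup from the module docstring is assumed):
#
#   import subprocess
#   subprocess.run(
#       ['java', '-mx3g', 'edu.stanford.nlp.pipeline.StanfordCoreNLP',
#        '-annotators', 'tokenize,ssplit,pos,lemma',
#        '-outputFormat', 'json',
#        '-outputDirectory', '../temp/',
#        '-file', f'../temp/filtered_{fileName}'],
#       check=True)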

def annotations_table(sentencesText, sentencesTextExt, annDict):
    """Build two annotation tables: one restricted to sentences with at least
    one organism mention, and an extended one covering every sentence."""
    orgAnnotations = []
    fileSentences = list(annDict.values())
    orgAnnotationsExt = fileSentences
    for annotations in fileSentences:
        temp = []
        for annotation in annotations:
            if annotation is not None:
                temp.append(annotation)
        orgAnnotations.append(temp)
        
    # Per-file sentence counters (1..n) and the matching file-name column.
    nSentencePerFile = [list(range(1, len(sentences)+1)) for sentences in orgAnnotations]
    nSentencePerFileExt = [list(range(1, len(sentences)+1)) for sentences in orgAnnotationsExt]

    filesList = [np.repeat(fileName, len(nSen)) for fileName, nSen in zip(annDict.keys(), nSentencePerFile)]
    filesListExt = [np.repeat(fileName, len(nSen)) for fileName, nSen in zip(annDict.keys(), nSentencePerFileExt)]
    filesList = [item for sublist in filesList for item in sublist]
    filesListExt = [item for sublist in filesListExt for item in sublist]

    nSentencePerFile = [item for sublist in nSentencePerFile for item in sublist]
    nSentencePerFileExt = [item for sublist in nSentencePerFileExt for item in sublist]
    
    # Position (within the full file) of each annotated sentence.
    nSentenceInFile = []
    for annotations in annDict.values():
        n = 0
        temp = []
        for annotation in annotations:
            n += 1
            if annotation is not None:
                temp.append(n)
        nSentenceInFile.append(temp)

    nSentenceInFile = [item for sublist in nSentenceInFile for item in sublist]
    
    annPerSen = [annotation for annotations in orgAnnotations for annotation in annotations]
    annPerSenExt = [annotation for annotations in orgAnnotationsExt for annotation in annotations]
    
    # Join the organisms and mention strings of each sentence with ';'.
    orgPerSen = []
    textPerSen = []
    setOrgPerSen = []
    nOrgPerSentence = []
    for annList in annPerSen:
        tempOrg = []
        tempText = []
        for ann in annList:
            tempOrg.append(ann['infons']['preferred_form'])
            tempText.append(ann['text'])
        orgPerSen.append(';'.join(tempOrg))
        uniqueOrg = list(set(tempOrg))
        nOrgPerSentence.append(len(uniqueOrg))
        setOrgPerSen.append(';'.join(uniqueOrg))
        textPerSen.append(';'.join(tempText))
        
    # Same, for the extended table (empty strings for sentences without organisms).
    orgPerSenExt = []
    textPerSenExt = []
    setOrgPerSenExt = []
    nOrgPerSentenceExt = []
    for annList in annPerSenExt:
        tempOrg = []
        tempText = []
        if annList is not None:
            for ann in annList:
                tempOrg.append(ann['infons']['preferred_form'])
                tempText.append(ann['text'])
        else:
            tempOrg.append('')
            tempText.append('')
        orgPerSenExt.append(';'.join(tempOrg))
        uniqueOrg = list(set(tempOrg))
        nOrgPerSentenceExt.append(len([elm for elm in uniqueOrg if elm != '']))
        setOrgPerSenExt.append(';'.join(uniqueOrg))
        textPerSenExt.append(';'.join(tempText))

    data = {
        'SentenceNumber': list(range(1,len(filesList)+1)),
        'File': filesList,
        'SentenceNumberPerFile': nSentencePerFile,
        'SentenceNumberInFile': nSentenceInFile,
        'NumberOfOrganism': nOrgPerSentence,
        'Organisms': orgPerSen,
        'Mentions': textPerSen,
        'OrganismSet': setOrgPerSen,
        'Sentence': sentencesText
    }
    
    dataExt = {
        'SentenceNumber': list(range(1,len(filesListExt)+1)),
        'File': filesListExt,
        'SentenceNumberPerFile': nSentencePerFileExt,
        'NumberOfOrganism': nOrgPerSentenceExt,
        'Organisms': orgPerSenExt,
        'Mentions': textPerSenExt,
        'OrganismSet': setOrgPerSenExt,
        'Sentence': [sentence for sentences in sentencesTextExt.values() for sentence in sentences]
    }
    
    df = pd.DataFrame(data)
    dfExt = pd.DataFrame(dataExt)
    return df,dfExt

            

##############################################################################################



if args.file is None and args.directory is None: sys.exit('Enter a file or directory with text files.')
if args.file is not None and args.directory is not None: sys.exit('Enter only one file or directory at a time.')

print('\n****************************************************************\n')

if args.file is not None:
    filename = args.file.split('/')[-1][0:-4]  # basename without the '.txt' extension
    print(f'Reading file: {filename}.txt')
    with open(args.file, 'r') as file:
        namefile = args.file.split('/')[-1]
        sentences = {namefile: file.readlines()}
if args.directory is not None:
    filename = args.directory.split('/')[-2]  # last path component (the path must end with '/')
    print(f'Reading files from directory {filename}:')
    files = os.listdir(args.directory)
    files = [file for file in files if file.endswith('.txt')]
    print('\n'.join(files))
    sentences = {}
    for file in files:
        with open(os.path.join(args.directory, file), 'r') as file_toread:
            sentences[file] = file_toread.readlines()
    #sentences = [item for sublist in sentences for item in sublist]
    
print('\n****************************************************************\n')

if os.path.exists(f'../temp/work_{filename}.joblib'):
    print('Previous work found.')
    annotations = joblib.load(f'../temp/work_{filename}.joblib')
    sentences = del_refer(sentences)
else:
    print('Starting new work.')
    sentences = del_refer(sentences)
    annotations = annotate(sentences)
    joblib.dump(annotations, f'../temp/work_{filename}.joblib')

print('\n****************************************************************\n')
print('Filtering sentences with organism mentions (CoreNLP tokenization is currently disabled)...')
sentencesExt = sentences              # keep the unfiltered sentences for the extended table
sentences = coreNLP(sentences, annotations)
    
print('\n****************************************************************\n')

print(f'Saving annotations in ../results/annotations_{filename}.csv')
annotationsDf, annotationsDfExt = annotations_table(sentences, sentencesExt, annotations)
annotationsDf.to_csv(f'../results/annotations_{filename}.csv')
annotationsDfExt.to_csv(f'../results/annotationsExt_{filename}.csv')
    
# Per-file statistics; every row of annotationsDf has at least one organism.
stats = {}
for idx in range(len(annotationsDf)):
    ann = annotationsDf.loc[idx]
    if ann.File not in stats:
        stats[ann.File] = {
            'TotalSen': len(annotations[ann.File]),
            'SenWithOrg': len([sen for sen in annotations[ann.File] if sen is not None]),
            'TotalOrgs': 0,
            'TotalMentions': 0,
            'CountOrgs': {}}
    stats[ann.File]['TotalMentions'] += len(ann.Mentions.split(';'))
    # Count, per file, how many sentences mention each organism.
    for org in ann.OrganismSet.split(';'):
        stats[ann.File]['CountOrgs'][org] = stats[ann.File]['CountOrgs'].get(org, 0) + 1

# Global organism counts, sorted most-mentioned first.
orgStats = {}
for text, value in stats.items():
    stats[text]['CountOrgs'] = dict(sorted(value['CountOrgs'].items(), key=lambda item: item[1], reverse=True))
    stats[text]['TotalOrgs'] = len(value['CountOrgs'])
    for org, count in value['CountOrgs'].items():
        orgStats[org] = orgStats.get(org, 0) + count
orgStats = dict(sorted(orgStats.items(), key=lambda item: item[1], reverse=True))

print(f'Number of organisms detected: {len(orgStats)}')

statsPerText = {
    'Text' : list(stats.keys()),
    'TotalSen' : [value['TotalSen'] for value in stats.values()],
    'SenWithOrg' : [value['SenWithOrg'] for value in stats.values()],
    'TotalOrgs' : [value['TotalOrgs'] for value in stats.values()],
    'TotalMentions' : [value['TotalMentions'] for value in stats.values()],
    'Orgs(Sorted)' : [';'.join(value['CountOrgs']) for value in stats.values()]
}
pd.DataFrame(statsPerText).to_csv(f'../results/stats_{filename}.csv')
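
# Possible follow-up (sketch, not part of the original pipeline): plot the
# most frequent organisms. The PNG file name is hypothetical.
#
#   import matplotlib.pyplot as plt
#   top = list(orgStats.items())[:20]
#   plt.figure(figsize=(10, 6))
#   plt.barh([org for org, _ in top], [count for _, count in top])
#   plt.xlabel('Sentences with a mention')
#   plt.tight_layout()
#   plt.savefig(f'../results/orgStats_{filename}.png')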