Carlos-Francisco Méndez-Cruz

Setting up project

<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$" vcs="Git" />
</component>
</project>
# -*- coding: UTF-8 -*-
from optparse import OptionParser
import os
import sys
from time import time
import re
__author__ = 'CMendezC'
# Objective: Take text-annotated-abstracts-original.txt as input,
# split the abstracts into separate files without tags, and collect a
# dictionary of genes for tagging after the NLP pipeline.
# Parameters:
# 1) --inputPath Input path.
# 2) --inputFile Input file.
# 3) --outputPath Output path
# Execution:
#C:\Users\cmendezc\Documents\GENOMICAS\gitlab-conditional-random-fields\data-sets\original
if __name__ == "__main__":
# Parameter definition
parser = OptionParser()
parser.add_option("--inputPath", dest="inputPath",
help="Input path", metavar="PATH")
parser.add_option("--inputFile", dest="inputFile",
help="Input file", metavar="FILE")
parser.add_option("--outputPath", dest="outputPath",
help="Output path", metavar="PATH")
(options, args) = parser.parse_args()
if len(args) > 0:
parser.error("None parameters indicated.")
sys.exit(1)
# Printing parameter values
print('-------------------------------- PARAMETERS --------------------------------')
print("Input path: " + str(options.inputPath))
print("Input file", str(options.inputFile))
print("Output path: " + str(options.outputPath))
filesWritten = 0
t0 = time()
hashGenes = {}
rePmid = re.compile(r'(\d+)\|a\|')
reGene = re.compile(r'<g>([^<]+)</g>')
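# Sketch of the expected input (hypothetical PubTator-like line, pmid|a|abstract):
#   12345|a|The <g>fhlA</g> gene encodes ...
# With the regexes above: rePmid.match(line).group(1) -> '12345',
# reGene.findall(line) -> ['fhlA']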
with open(os.path.join(options.inputPath, options.inputFile), "r", encoding="utf-8", errors="replace") as iFile:
print("Reading file..." + options.inputFile)
for line in iFile:
line = line.strip('\n')
for gene in reGene.findall(line):
print("genes: {}".format(gene))
# Collect the gene dictionary announced in the objective
hashGenes[gene] = hashGenes.get(gene, 0) + 1
result = rePmid.match(line)
if result:
with open(os.path.join(options.outputPath, result.group(1) + ".txt"), "w", encoding="utf-8", errors="replace") as oFile:
# Strip the <g>...</g> tags so the abstract is written without tags
oFile.write(reGene.sub(r'\1', line))
filesWritten += 1
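# Summary (sketch): report the counters gathered above
print("Files written: " + str(filesWritten))
print("Genes collected: " + str(len(hashGenes)))
print("Done in: %fs" % (time() - t0))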
# -*- coding: UTF-8 -*-
from optparse import OptionParser
import os
import sys
from time import time
import json
from nltk.corpus import stopwords
__author__ = 'CMendezC'
# Objective: Take a transformed file with format word|lemma|tag,
# for example: Multiple|multiple|JJ genetic|genetic|JJ variants|variant|NNS have|have|VBP
# and create a file with an additional tag for CRF training. For example:
# the|the|DT N-terminal|N-terminal|NN domain|domain|NN -->
# the|the|DT|O N-terminal|N-terminal|NN|DPOS domain|domain|NN|O
# Additionally, we aTag frequent words belonging to each aspect with the corresponding aspect aTag (DOM or RP).
# We expect these words to take one aTag in some contexts and a different one in others.
# The frequent words were obtained by another program (corpusAnalysis) and saved into files that are loaded here.
# In the output file we only keep the lemma and the tag, or the word and the tag.
# This additional tagging gives the CRF clues for aspect classification.
# Parameters:
# 1) --inputPath Path to read files.
# 2) --trainingFile File name with training data.
# 3) --testFile File name with test data.
# 4) --trainingClassesFile File name with training classes.
# 5) --testClassesFile File name with test classes.
# 6) --outputPath Path to write files.
# 7) --termPath Path to read the term files and the JSON files below.
# 8) --termFiles JSON file with term files and tags.
# 9) --inputFileFreq JSON file with information about frequent words.
# 10) --skip=N Skip N words to form skip mentions.
# 11) --filterStopWords Filter stop words.
# 12) --filterPunctMarks Filter punctuation marks.
# (ELIMINATED) --feature Type of feature to extract and create file: lemma.
# Output:
# 1) Files created; filter and skip settings are concatenated to the file name.
# Execution:
# ASPECTS
# python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\aspects_TrainingTest_RP_DOM_20160723\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\aspects_TrainingTest_RP_DOM_20160723\CRF_trainingTest_Datasets --feature lemma,word --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
# SENTENCES
# python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\sentences_TrainingTest_RP_DOM_20160725\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\corpus\aspectClassificationDatasets\sentences_TrainingTest_RP_DOM_20160725\CRF_trainingTest_Datasets --feature lemma,word --termPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\resources\termLists --termFiles termFilesTag.json
# none: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json
# stopwords: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json --stopWords
# stopwords AND filterPunctMarks: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json --stopWords --filterPunctMarks
# filterPunctMarks: python preparingTrainingTestDatasets_v1.0.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\transformed --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --trainingClassesFile classesTraining.txt --testClassesFile classesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF\trainingTest_Datasets\aspectTagged --termPath C:\Users\cmendezc\Documents\GENOMICAS\terminologicalResources --termFiles termFilesTag_TFSummarization.json --inputFileFreq freqWords_Aspect.json --filterPunctMarks
# Open question: --skip (see getSkipMentions below)
def getSkipMentions(aList, aSkip):
# Split aList into aSkip interleaved sublists: sublist j takes elements
# j, j + (aSkip + 1), j + 2*(aSkip + 1), ...
hashTemp = {}
for j in range(0, aSkip):
listTemp = []
for i in range(0, len(aList), aSkip + 1):
# Guard against indexing past the end of the list
if i + j < len(aList):
listTemp.append(aList[i + j])
hashTemp[j] = listTemp
return hashTemp
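# Example (hypothetical): getSkipMentions(['a', 'b', 'c', 'd', 'e'], 2)
# -> {0: ['a', 'd'], 1: ['b', 'e']}
# (sublist j keeps every (aSkip+1)-th element starting at offset j)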
###########################################################
# MAIN PROGRAM #
###########################################################
if __name__ == "__main__":
# Parameter definition
parser = OptionParser()
parser.add_option("--inputPath", dest="inputPath",
help="Path to read files", metavar="PATH")
parser.add_option("--trainingFile", dest="trainingFile",
help="File with training examples", metavar="FILE")
parser.add_option("--testFile", dest="testFile",
help="File with test examples", metavar="FILE")
parser.add_option("--trainingClassesFile", dest="trainingClassesFile",
help="File with training classes", metavar="FILE")
parser.add_option("--testClassesFile", dest="testClassesFile",
help="File with test classes", metavar="FILE")
parser.add_option("--outputPath", dest="outputPath",
help="Path to write output file, feature parameter is concatenated to file name.", metavar="PATH")
parser.add_option("--termPath", dest="termPath",
help="Path to read term files", metavar="PATH")
parser.add_option("--termFiles", dest="termFiles",
help="JSON file with terms files and tags", metavar="PATH")
parser.add_option("--inputFileFreq", dest="inputFileFreq",
help="JSON file with information about frequent words", metavar="PATH")
parser.add_option("--skip", type="int",
dest="skip", default=0,
help="Skip mentions", metavar="N")
parser.add_option("--filterStopWords", default=False,
action="store_true", dest="filterStopWords",
help="Filtering stop words")
parser.add_option("--filterPunctMarks", default=False,
action="store_true", dest="filterPunctMarks",
help="Filtering punctuation marks")
(options, args) = parser.parse_args()
if len(args) > 0:
parser.error("None parameters indicated.")
sys.exit(1)
# Printing parameter values
print('-------------------------------- PARAMETERS --------------------------------')
print("Path to read files: " + str(options.inputPath))
print("File with training examples", str(options.trainingFile))
print("File with test examples", str(options.testFile))
print("File with training classes", str(options.trainingClassesFile))
print("File with test classes", str(options.testClassesFile))
print("File with training classes", str(options.trainingClassesFile))
print("File with test classes", str(options.testClassesFile))
print("Path to write output files: " + str(options.outputPath))
print("JSON file with information about frequent words: " + str(options.inputFileFreq))
print("Skip mentions: " + str(options.skip))
print("Filtering stop words: " + str(options.stopWords))
punctMarks = ['.', ',', ':', ';', '?', '!', '\'', '"']
print("Filtering puntuation marks " + str(punctMarks) + ': '+ str(options.filterPunctMarks))
filesRead = 0
t0 = time()
print('Loading biological term files...')
with open(os.path.join(options.termPath, options.termFiles)) as data_file:
hashes = json.load(data_file)
print(' Loading biological term files... done')
hashTagAspect = hashes["hashTagAspect"]
print('Loading frequent words...')
with open(os.path.join(options.termPath, options.inputFileFreq)) as data_file:
hashAspectFreqWords = json.load(data_file)
print(' Loading frequent words... done')
listFiles = [options.trainingFile, options.testFile]
listClassesFiles = [options.trainingClassesFile, options.testClassesFile]
for iFile, cFile in zip(listFiles, listClassesFiles):
with open(os.path.join(options.inputPath, iFile), "r", encoding="utf-8", errors="replace") as tFile:
print("Reading file..." + iFile)
lines = [l.strip('\n') for l in tFile.readlines()]
filesRead += 1
with open(os.path.join(options.inputPath, cFile), "r", encoding="utf-8", errors="replace") as clFile:
print("Reading file..." + cFile)
classes = [c.strip('\n') for c in clFile.readlines()]
listLines = []
print("Processing files... ")
for line, c in zip(lines, classes):
# print("class: ", c)
listTokenLine = []
# listLemmaLine = []
for tok in line.split():
tokList = tok.split("|")
word = tokList[0]
lemma = tokList[1]
tag = tokList[2]
# Filtering stopwords
if options.filterStopWords:
if lemma in stopwords.words('english'):
continue
if options.filterPunctMarks:
if lemma in punctMarks:
continue
# We change the term tag to an aspect tag only when the aspect tag coincides
# with the class: we want the CRF to learn when to change a term tag to an
# aspect tag in the correct context.
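# Example (hypothetical mapping): with hashTagAspect = {'DPOS': 'DOM'} and a
# sentence of class c = 'DOM', a DPOS-tagged token gets aTag = 'DOM';
# in a sentence of class 'RP' the same token gets aTag = 'O'.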
if tag in hashTagAspect:
if hashTagAspect[tag] == c:
aTag = hashTagAspect[tag]
else:
aTag = 'O'
else:
if c in hashAspectFreqWords:
# print("class: ", c)
hashFreqWords = hashAspectFreqWords[c]
# We verify if word or lemma is in frequent words.
# These frequent words are word-forms (tokens)
if word.lower() in hashFreqWords or lemma in hashFreqWords:
aTag = c
else:
aTag = 'O'
else:
aTag = 'O'
listTokenLine.append(word + "|" + lemma + "|" + tag + "|" + aTag)
# if feature == "word":
listLines.append(listTokenLine)
# if feature == "lemma":
# listLines = listLemmaLine.strip() + '\n'
if options.skip > 0:
t0 = time()
# Build the skip mentions once and write one file per interleaved sublist.
# NOTE (assumption): the '_<key>' suffix keeps each sublist in its own file
# instead of overwriting a single output file.
hashTemp = getSkipMentions(listLines, options.skip)
for key in hashTemp:
skipLines = hashTemp[key]
with open(os.path.join(options.outputPath, iFile.replace('.txt', '.StopWords_' + str(
options.filterStopWords) + '.FilterPunctMarks_' + str(options.filterPunctMarks) + '.Skip_' + str(options.skip) + '_' + str(key) + '.txt')), "w",
encoding="utf-8") as oFile:
for line in skipLines:
for token in line:
oFile.write(token + ' ')
oFile.write('\n')
print("Skip mention done in: %fs" % (time() - t0))
else:
with open(os.path.join(options.outputPath, iFile.replace('.txt', '.StopWords_' + str(
options.filterStopWords) + '.FilterPunctMarks_' + str(options.filterPunctMarks) + '.Skip_' + str(options.skip) + '.txt')), "w",
encoding="utf-8") as oFile:
for line in listLines:
for token in line:
oFile.write(token + ' ')
oFile.write('\n')
print("Files processed: " + str(filesRead))
# -*- coding: UTF-8 -*-
import os
from itertools import chain
from optparse import OptionParser
from time import time
from collections import Counter
import nltk
import sklearn
import scipy.stats
import sys
from sklearn.externals import joblib
from sklearn.metrics import make_scorer
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import RandomizedSearchCV
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from nltk.corpus import stopwords
from trainingTesting_Sklearn_crfsuite import word2features
from trainingTesting_Sklearn_crfsuite import sent2features
# from trainingTesting_Sklearn_crfsuite import hasNonAlphaNum
# from trainingTesting_Sklearn_crfsuite import hasDigit
# Objective
# Tagging transformed file with CRF model with sklearn-crfsuite.
#
# Input parameters
# --inputPath=PATH Path of transformed files x|y|z
# --modelPath Path to CRF model
# --modelName Model name
# --outputPath=PATH Output path to place output files
# --filterStopWords Filtering stop words
# --filterSymbols Filtering punctuation marks
# Output
# 1) Tagged files in transformed format
# Examples
# Sentences
# C:\Anaconda2\python tagging_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed --modelName aspectsTraining.fStopWords_False.fSymbols_True --modelPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed_CRFtagged --filterSymbols > output.taggingCRF.20161107.txt
# C:\Anaconda2\python tagging_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed --modelName sentencesTraining.fStopWords_False.fSymbols_False --modelPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\classifying_TFSentences\corpus\ECK120011394_FhlA\transformed_CRFtagged > output.taggingCRF.20161107.txt
#################################
# FUNCTIONS #
#################################
# hasDigit, hasNonAlphaNum, word2features, and sent2features are defined in
# trainingTesting_Sklearn_crfsuite; word2features and sent2features are
# imported above, and the full feature definitions can be consulted there.
__author__ = 'CMendezC'
##########################################
# MAIN PROGRAM #
##########################################
if __name__ == "__main__":
# Defining parameters
parser = OptionParser()
parser.add_option("--inputPath", dest="inputPath",
help="Path of training data set", metavar="PATH")
parser.add_option("--outputPath", dest="outputPath",
help="Output path to place output files",
metavar="PATH")
parser.add_option("--modelPath", dest="modelPath",
help="Path to read CRF model",
metavar="PATH")
parser.add_option("--modelName", dest="modelName",
help="Model name", metavar="TEXT")
parser.add_option("--filterStopWords", default=False,
action="store_true", dest="filterStopWords",
help="Filtering stop words")
parser.add_option("--filterSymbols", default=False,
action="store_true", dest="filterSymbols",
help="Filtering punctuation marks")
(options, args) = parser.parse_args()
if len(args) > 0:
parser.error("Any parameter given.")
sys.exit(1)
print('-------------------------------- PARAMETERS --------------------------------')
print("Path to read input files: " + options.inputPath)
print("Mode name: " + str(options.modelName))
print("Model path: " + options.modelPath)
print("Path to place output files: " + options.outputPath)
print("Filtering stop words: " + str(options.filterStopWords))
symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
'}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))
print('-------------------------------- PROCESSING --------------------------------')
stopwords = [word.decode('utf-8') for word in stopwords.words('english')]
# Read CRF model
t0 = time()
print('Reading CRF model...')
crf = joblib.load(os.path.join(options.modelPath, 'models', options.modelName + '.mod'))
print("Reading CRF model done in: %fs" % (time() - t0))
print('Processing corpus...')
t0 = time()
labels = list(['MF', 'TF', 'DFAM', 'DMOT', 'DPOS', 'PRO'])
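# Note: in the tagging loop below, a token's original tag is replaced by the
# CRF prediction only when the tag is not already one of these labels and the
# predicted tag is not 'O'.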
# Walk directory to read files
for path, dirs, files in os.walk(options.inputPath):
# For each file in dir
for file in files:
print(" Preprocessing file..." + str(file))
sentencesInputData = []
sentencesOutputData = []
with open(os.path.join(options.inputPath, file), "r") as iFile:
lines = iFile.readlines()
for line in lines:
listLine = []
# line = line.decode("utf-8")
for token in line.strip('\n').split():
if options.filterStopWords:
listToken = token.split('|')
lemma = listToken[1]
# Original if lemma in stopwords.words('english'):
if lemma in stopwords:
continue
if options.filterSymbols:
listToken = token.split('|')
lemma = listToken[1]
if lemma in symbols:
if lemma == ',':
print "Coma , identificada"
continue
listLine.append(token)
sentencesInputData.append(listLine)
print " Sentences input data: " + str(len(sentencesInputData))
# print sentencesInputData[0]
# print(sent2features(sentencesInputData[0])[0])
# print(sent2labels(sentencesInputData[0]))
X_input = [sent2features(s) for s in sentencesInputData]
print(sent2features(sentencesInputData[0])[0])
# y_test = [sent2labels(s) for s in sentencesInputData]
# Predicting tags
t1 = time()
print " Predicting tags with model"
y_pred = crf.predict(X_input)
print y_pred[0]
print(" Prediction done in: %fs" % (time() - t1))
# Tagging with CRF model
print " Tagging file"
for line, tagLine in zip(lines, y_pred):
outputLine = ''
idx_tagLine = 0
line = line.strip('\n')
print "\nLine: " + str(line)
print "CRF tagged line: " + str(tagLine)
for token in line.split():
listToken = token.split('|')
word = listToken[0]
lemma = listToken[1]
tag = listToken[2]
if options.filterStopWords:
if lemma in stopwords:
outputLine += token + ' '
continue
if options.filterSymbols:
if lemma in symbols:
if lemma == ',':
print "Coma , identificada"
outputLine += token + ' '
continue
CRFtag = tagLine[idx_tagLine]
if (tag not in labels) and (CRFtag != 'O'):
print "*** CRF change token {} to {}".format(token, CRFtag)
outputLine += word + '|' + lemma + '|' + CRFtag + ' '
else:
outputLine += word + '|' + lemma + '|' + tag + ' '
idx_tagLine += 1
sentencesOutputData.append(outputLine.rstrip())
with open(os.path.join(options.outputPath, file), "w") as oFile:
for line in sentencesOutputData:
oFile.write(line + '\n')
print("Processing corpus done in: %fs" % (time() - t0))
# -*- coding: UTF-8 -*-
import os
from itertools import chain
from optparse import OptionParser
from time import time
from collections import Counter
import nltk
import sklearn
import scipy.stats
import sys
from sklearn.externals import joblib
from sklearn.metrics import make_scorer
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import RandomizedSearchCV
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from nltk.corpus import stopwords
# Objective
# Training and evaluation of CRFs with sklearn-crfsuite.
#
# Input parameters
# --inputPath=PATH Path of training and test data set
# --trainingFile File with training data set
# --testFile File with test data set
# --outputPath=PATH Output path to place output files
# --filterStopWords Filtering stop words
# --filterSymbols Filtering punctuation marks
# Output
# 1) Best model
# Examples
# Sentences
# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS > output.TrainingTestingCRF.20161106_1.txt
# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords > output.TrainingTestingCRF.20161106_2.txt
# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterSymbols > output.TrainingTestingCRF.20161106_3.txt
# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile sentencesTraining.txt --testFile sentencesTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords --filterSymbols > output.TrainingTestingCRF.20161106_4.txt
# Aspects
# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS > output.TrainingTestingCRF.20161106_5.txt
# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords > output.TrainingTestingCRF.20161106_6.txt
# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterSymbols > output.TrainingTestingCRF.20161106_7.txt
# C:\Anaconda2\python trainingTesting_Sklearn_crfsuite.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS\trainingTest_Datasets --trainingFile aspectsTraining.txt --testFile aspectsTest.txt --outputPath C:\Users\cmendezc\Documents\GENOMICAS\AUTOMATIC_SUMMARIZATION_TFS\trainingTest_CRF_TERM_TAGS --filterStopWords --filterSymbols > output.TrainingTestingCRF.20161106_8.txt
#################################
# FUNCTIONS #
#################################
def wordSize(text):
lWord = len(text)
if lWord == 1:
return '1'
elif lWord == 2:
return '2'
elif lWord == 3:
return '3'
elif lWord == 4:
return '4'
elif lWord == 5:
return '5'
elif 6 <= lWord <= 10:
return '6-10'
elif 11 <= lWord <= 15:
return '11-15'
elif 16 <= lWord <= 20:
return '16-20'
elif 21 <= lWord <= 30:
return '21-30'
else:
return '>30'
def hasUpperLower(text):
if len(text) < 3:
return False
regexUp = nltk.re.compile('[A-Z]')
regexLo = nltk.re.compile('[a-z]')
return (regexUp.search(text) is not None) and (regexLo.search(text) is not None)
def hasDigit(text):
if len(text) < 3:
return False
myRegex = nltk.re.compile('[0-9]')
return myRegex.search(text) is not None
def hasNonAlphaNum(text):
if len(text) < 3:
return False
myRegex = nltk.re.compile(r'\W')
return myRegex.search(text) is not None
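# Examples (hypothetical inputs):
#   wordSize('transcription') -> '11-15' (13 characters)
#   hasUpperLower('FhlA') -> True; hasUpperLower('fhla') -> False
#   hasDigit('p53') -> True; hasDigit('53') -> False (shorter than 3 characters)
#   hasNonAlphaNum('sigma-54') -> True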
def word2features(sent, i):
# print "i: " + str(i)
# print "sent[i]" + sent[i]
listElem = sent[i].split('|')
word = listElem[0]
lemma = listElem[1]
postag = listElem[2]
features = {
# Names of TF and genes change by lower and upper characters: 'word.lower()': word.lower(),
# Suffixes
'word[-3:]': word[-3:],
'word[-2:]': word[-2:],
'word[-1:]': word[-1:],
'word.isupper()': word.isupper(),
'word.istitle()': word.istitle(),
'word.hasDigit()': hasDigit(word),
'word.hasNonAlphaNum': hasNonAlphaNum(word),
# 'word.hasUpperLower': hasUpperLower(word),
#'wordSize': wordSize(word),
# 'word.isdigit()': word.isdigit(),
'word': word,
'lemma': lemma,
'lemma[-3:]': lemma[-3:],
'lemma[-2:]': lemma[-2:],
'lemma[-1:]': lemma[-1:],
'postag': postag,
# Prefixes
'postag[:2]': postag[:2],
'postag[:1]': postag[:1],
}
if i > 0:
listElem = sent[i - 1].split('|')
word1 = listElem[0]
lemma1 = listElem[1]
postag1 = listElem[2]
features.update({
'-1:word.lower()': word1.lower(),
'-1:word.istitle()': word1.istitle(),
'-1:word.isupper()': word1.isupper(),
'-1:word.hasDigit()': hasDigit(word1),
'-1:word.hasNonAlphaNum': hasNonAlphaNum(word1),
# '-1:word.hasUpperLower': hasUpperLower(word1),
'-1:word': word1,
'-1:lemma': lemma1,
'-1:postag': postag1,
'-1:postag[:2]': postag1[:2],
'-1:postag[:1]': postag1[:1],
})
# else:
# features['BOS'] = True
if i < len(sent) - 1:
listElem = sent[i + 1].split('|')
word1 = listElem[0]
lemma1 = listElem[1]
postag1 = listElem[2]
features.update({
'+1:word.lower()': word1.lower(),
'+1:word.istitle()': word1.istitle(),
'+1:word.isupper()': word1.isupper(),
'+1:word.hasDigit()': hasDigit(word1),
'+1:word.hasNonAlphaNum': hasNonAlphaNum(word1),
# '+1:word.hasUpperLower': hasUpperLower(word1),
'+1:word': word1,
'+1:lemma': lemma1,
'+1:postag': postag1,
'+1:postag[:2]': postag1[:2],
'+1:postag[:1]': postag1[:1],
})
# else:
# features['EOS'] = True
if i > 1:
listElem = sent[i - 2].split('|')
word2 = listElem[0]
lemma2 = listElem[1]
postag2 = listElem[2]
features.update({
'-2:word.lower()': word2.lower(),
'-2:word.istitle()': word2.istitle(),
'-2:word.isupper()': word2.isupper(),
'-2:word.hasDigit()': hasDigit(word2),
'-2:word.hasNonAlphaNum': hasNonAlphaNum(word2),
# '-2:word.hasUpperLower': hasUpperLower(word2),
'-2:word': word2,
'-2:lemma': lemma2,
'-2:postag': postag2,
'-2:postag[:2]': postag2[:2],
'-2:postag[:1]': postag2[:1],
})
if i < len(sent) - 2:
listElem = sent[i + 2].split('|')
word2 = listElem[0]
lemma2 = listElem[1]
postag2 = listElem[2]
features.update({
'+2:word.lower()': word2.lower(),
'+2:word.istitle()': word2.istitle(),
'+2:word.isupper()': word2.isupper(),
'+2:word.hasDigit()': hasDigit(word2),
'+2:word.hasNonAlphaNum': hasNonAlphaNum(word2),
# '+2:word.hasUpperLower': hasUpperLower(word2),
'+2:word': word2,
'+2:lemma': lemma2,
'+2:postag': postag2,
'+2:postag[:2]': postag2[:2],
'+2:postag[:1]': postag2[:1],
})
trigrams = False
if trigrams:
if i > 2:
listElem = sent[i - 3].split('|')
word3 = listElem[0]
lemma3 = listElem[1]
postag3 = listElem[2]
features.update({
'-3:word.lower()': word3.lower(),
'-3:word.istitle()': word3.istitle(),
'-3:word.isupper()': word3.isupper(),
'-3:word.hasDigit()': hasDigit(word3),
'-3:word.hasNonAlphaNum': hasNonAlphaNum(word3),
# '-3:word.hasUpperLower': hasUpperLower(word3),
'-3:word': word3,
'-3:lemma': lemma3,
'-3:postag': postag3,
'-3:postag[:2]': postag3[:2],
'-3:postag[:1]': postag3[:1],
})
if i < len(sent) - 3:
listElem = sent[i + 3].split('|')
word3 = listElem[0]
lemma3 = listElem[1]
postag3 = listElem[2]
features.update({
'+3:word.lower()': word3.lower(),
'+3:word.istitle()': word3.istitle(),
'+3:word.isupper()': word3.isupper(),
'+3:word.hasDigit()': hasDigit(word3),
'+3:word.hasNonAlphaNum': hasNonAlphaNum(word3),
# '+3:word.hasUpperLower': hasUpperLower(word3),
'+3:word': word3,
'+3:lemma': lemma3,
'+3:postag': postag3,
'+3:postag[:2]': postag3[:2],
'+3:postag[:1]': postag3[:1],
})
return features
def sent2features(sent):
return [word2features(sent, i) for i in range(len(sent))]
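# Usage sketch (hypothetical word|lemma|postag tokens):
#   sent = ['FhlA|fhla|NN', 'activates|activate|VBZ', 'transcription|transcription|NN']
#   feats = sent2features(sent)
#   feats[1]['-1:lemma'] -> 'fhla'; feats[1]['+1:postag'] -> 'NN'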
def sent2labels(sent):
return [elem.split('|')[3] for elem in sent]
# return [label for token, postag, label in sent]
def sent2tokens(sent):
# Tokens are word|lemma|tag(|aTag) strings, so return the word field
return [elem.split('|')[0] for elem in sent]
def print_transitions(trans_features, f):
for (label_from, label_to), weight in trans_features:
# f.write("%-6s -> %-7s %0.6f\n" % (label_from, label_to, weight))
# f.write("label_from :" + label_from)
# f.write("label_to :" + label_to)
# f.write("label_weight :" + weight)
# f.write("{} -> {} {:0.6f}\n".format(label_from.encode("utf-8"), label_to.encode("utf-8"), weight))
f.write("{:6} -> {:7} {:0.6f}\n".format(label_from, label_to, weight))
def print_state_features(state_features, f):
for (attr, label), weight in state_features:
# f.write("%0.6f %-8s %s\n" % (weight, label, attr))
# f.write(attr.encode("utf-8"))
# '{:06.2f}'.format(3.141592653589793)
f.write("{:0.6f} {:8} {}\n".format(weight, label, attr.encode("utf-8")))
__author__ = 'CMendezC'
##########################################
# MAIN PROGRAM #
##########################################
if __name__ == "__main__":
# Defining parameters
parser = OptionParser()
parser.add_option("--inputPath", dest="inputPath",
help="Path of training data set", metavar="PATH")
parser.add_option("--outputPath", dest="outputPath",
help="Output path to place output files",
metavar="PATH")
parser.add_option("--trainingFile", dest="trainingFile",
help="File with training data set", metavar="FILE")
parser.add_option("--testFile", dest="testFile",
help="File with test data set", metavar="FILE")
parser.add_option("--filterStopWords", default=False,
action="store_true", dest="filterStopWords",
help="Filtering stop words")
parser.add_option("--filterSymbols", default=False,
action="store_true", dest="filterSymbols",
help="Filtering punctuation marks")
(options, args) = parser.parse_args()
if len(args) > 0:
parser.error("Any parameter given.")
sys.exit(1)
print('-------------------------------- PARAMETERS --------------------------------')
print("Path of training data set: " + options.inputPath)
print("File with training data set: " + str(options.trainingFile))
print("Path of test data set: " + options.inputPath)
print("File with test data set: " + str(options.testFile))
print("Filtering stop words: " + str(options.filterStopWords))
symbols = ['.', ',', ':', ';', '?', '!', '\'', '"', '<', '>', '(', ')', '-', '_', '/', '\\', '¿', '¡', '+', '{',
'}', '[', ']', '*', '%', '$', '#', '&', '°', '`', '...']
print("Filtering symbols " + str(symbols) + ': ' + str(options.filterSymbols))
print('-------------------------------- PROCESSING --------------------------------')
print('Reading corpus...')
t0 = time()
sentencesTrainingData = []
sentencesTestData = []
stopwords = [word.decode('utf-8') for word in stopwords.words('english')]
with open(os.path.join(options.inputPath, options.trainingFile), "r") as iFile:
# with open(os.path.join(options.inputPath, options.trainingFile), "r", encoding="utf-8", errors='replace') as iFile:
for line in iFile.readlines():
listLine = []
line = line.decode("utf-8")
for token in line.strip('\n').split():
if options.filterStopWords:
listToken = token.split('|')
lemma = listToken[1]
# Original: if lemma in stopwords.words('english'):
# trainingTesting_Sklearn_crfsuite.py:269:
# UnicodeWarning: Unicode equal comparison failed to
# convert both arguments to Unicode -
# interpreting them as being unequal
if lemma in stopwords:
continue
if options.filterSymbols:
listToken = token.split('|')
lemma = listToken[1]
if lemma in symbols:
# if lemma == ',':
# print "Comma , identified"
continue
listLine.append(token)
sentencesTrainingData.append(listLine)
print " Sentences training data: " + str(len(sentencesTrainingData))
# print sentencesTrainingData[0]
with open(os.path.join(options.inputPath, options.testFile), "r") as iFile:
# with open(os.path.join(options.inputPath, options.testFile), "r", encoding="utf-8", errors='replace') as iFile:
for line in iFile.readlines():
listLine = []
line = line.decode("utf-8")
for token in line.strip('\n').split():
if options.filterStopWords:
listToken = token.split('|')
lemma = listToken[1]
# Original if lemma in stopwords.words('english'):
if lemma in stopwords:
continue
if options.filterSymbols:
listToken = token.split('|')
lemma = listToken[1]
if lemma in symbols:
# if lemma == ',':
# print "Comma , identified"
continue
listLine.append(token)
sentencesTestData.append(listLine)
print " Sentences test data: " + str(len(sentencesTestData))
# print sentencesTestData[0]
print("Reading corpus done in: %fs" % (time() - t0))
print(sent2features(sentencesTrainingData[0])[0])
print(sent2features(sentencesTestData[0])[0])
# print(sent2labels(sentencesTrainingData[0]))
# print(sent2labels(sentencesTestData[0]))
t0 = time()
X_train = [sent2features(s) for s in sentencesTrainingData]
y_train = [sent2labels(s) for s in sentencesTrainingData]
X_test = [sent2features(s) for s in sentencesTestData]
# print X_test
y_test = [sent2labels(s) for s in sentencesTestData]
# Fixed parameters
# crf = sklearn_crfsuite.CRF(
# algorithm='lbfgs',
# c1=0.1,
# c2=0.1,
# max_iterations=100,
# all_possible_transitions=True
# )
# Hyperparameter Optimization
crf = sklearn_crfsuite.CRF(
algorithm='lbfgs',
max_iterations=100,
all_possible_transitions=True
)
params_space = {
'c1': scipy.stats.expon(scale=0.5),
'c2': scipy.stats.expon(scale=0.05),
}
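# Note: c1 and c2 are CRFsuite's L1/L2 regularization coefficients; these
# exponential priors (means 0.5 and 0.05) keep most sampled values small,
# favoring lightly regularized models.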
# Original: labels = list(crf.classes_)
# Original: labels.remove('O')
labels = list(['MF', 'TF', 'DFAM', 'DMOT', 'DPOS', 'PRO'])
# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
average='weighted', labels=labels)
# search
rs = RandomizedSearchCV(crf, params_space,
cv=3,
verbose=3,
n_jobs=-1,
n_iter=20,
# n_iter=50,
scoring=f1_scorer)
rs.fit(X_train, y_train)
# Fixed parameters
# crf.fit(X_train, y_train)
# Best hyperparameters
# crf = rs.best_estimator_
nameReport = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str(
options.filterSymbols) + '.txt')
with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="w") as oFile:
oFile.write("********** TRAINING AND TESTING REPORT **********\n")
oFile.write("Training file: " + options.trainingFile + '\n')
oFile.write('\n')
oFile.write('best params:' + str(rs.best_params_) + '\n')
oFile.write('best CV score:' + str(rs.best_score_) + '\n')
oFile.write('model size: {:0.2f}M\n'.format(rs.best_estimator_.size_ / 1000000))
print("Training done in: %fs" % (time() - t0))
t0 = time()
# Update best crf
crf = rs.best_estimator_
# Saving model
print(" Saving training model...")
t1 = time()
nameModel = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str(
options.filterSymbols) + '.mod')
joblib.dump(crf, os.path.join(options.outputPath, "models", nameModel))
print(" Saving training model done in: %fs" % (time() - t1))
# Evaluation against test data
y_pred = crf.predict(X_test)
print("*********************************")
name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str(
options.filterSymbols) + '.txt')
with open(os.path.join(options.outputPath, "reports", "y_pred_" + name), "w") as oFile:
for y in y_pred:
oFile.write(str(y) + '\n')
print("*********************************")
name = options.trainingFile.replace('.txt', '.fStopWords_' + str(options.filterStopWords) + '.fSymbols_' + str(
options.filterSymbols) + '.txt')
with open(os.path.join(options.outputPath, "reports", "y_test_" + name), "w") as oFile:
for y in y_test:
oFile.write(str(y) + '\n')
print("Prediction done in: %fs" % (time() - t0))
# labels = list(crf.classes_)
# labels.remove('O')
with open(os.path.join(options.outputPath, "reports", "report_" + nameReport), mode="a") as oFile:
oFile.write('\n')
oFile.write("Flat F1: " + str(metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)))
oFile.write('\n')
# labels = list(crf.classes_)
sorted_labels = sorted(
labels,
key=lambda name: (name[1:], name[0])
)
oFile.write(metrics.flat_classification_report(
y_test, y_pred, labels=sorted_labels, digits=3
))
oFile.write('\n')
oFile.write("\nTop likely transitions:\n")
print_transitions(Counter(crf.transition_features_).most_common(50), oFile)
oFile.write('\n')
oFile.write("\nTop unlikely transitions:\n")
print_transitions(Counter(crf.transition_features_).most_common()[-50:], oFile)
oFile.write('\n')
oFile.write("\nTop positive:\n")
print_state_features(Counter(crf.state_features_).most_common(200), oFile)
oFile.write('\n')
oFile.write("\nTop negative:\n")
print_state_features(Counter(crf.state_features_).most_common()[-200:], oFile)
oFile.write('\n')