Setting up project

Carlos-Francisco Méndez-Cruz
Commit 8d2b2c6e106861449f34af2f9de2771c719d11f3 8d2b2c6e 0 parents
Showing 5 changed files with 68 additions and 0 deletions
.idea/vcs.xml
prepare-abstracts.py
preparing-training-validation-test.py
tagging_Sklearn_crfsuite.py
training-validation.py
--- a/.idea/vcs.xml 0 → 100644
View file @8d2b2c6
+++ b/.idea/vcs.xml 0 → 100644
View file @8d2b2c6
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>
\ No newline at end of file
--- a/prepare-abstracts.py 0 → 100644
View file @8d2b2c6
+++ b/prepare-abstracts.py 0 → 100644
View file @8d2b2c6
+# -*- coding: UTF-8 -*-
+
+from optparse import OptionParser
+import os
+import sys
+from time import time
+import re
+
+__author__ = 'CMendezC'
+
+# Objective: take text-annotated-abstracts-original.txt as input, split the
+# abstracts into separate files without tags, and collect a dictionary of
+# genes to be used for tagging after the NLP pipeline.
+
+# Parameters:
+#   1) --inputPath    Input path.
+#   2) --inputFile    Input file.
+#   3) --outputPath   Output path.
+
+# Execution:
+#C:\Users\cmendezc\Documents\GENOMICAS\gitlab-conditional-random-fields\data-sets\original
+
+if __name__ == "__main__":
+    # Script entry point: read the annotated input file line by line and, for
+    # every line that starts with "<PMID>|a|", write that line to
+    # <outputPath>/<PMID>.txt. Gene mentions tagged as <g>...</g> are printed.
+
+    # Parameter definition
+    parser = OptionParser()
+    parser.add_option("--inputPath", dest="inputPath",
+                      help="Input path", metavar="PATH")
+    parser.add_option("--inputFile", dest="inputFile",
+                      help="Input file", metavar="FILE")
+    parser.add_option("--outputPath", dest="outputPath",
+                      help="Output path", metavar="PATH")
+
+    (options, args) = parser.parse_args()
+    # parser.error() prints the usage message and terminates the process by
+    # itself, so no explicit sys.exit() is needed after it.
+    if args:
+        parser.error("Unexpected positional arguments: " + " ".join(args))
+    # Fail early with a readable message instead of a TypeError from
+    # os.path.join() when an option was not supplied.
+    missing = [name for name in ("inputPath", "inputFile", "outputPath")
+               if getattr(options, name) is None]
+    if missing:
+        parser.error("Missing required option(s): "
+                     + ", ".join("--" + name for name in missing))
+
+    # Printing parameter values
+    print('-------------------------------- PARAMETERS --------------------------------')
+    print("Input path: " + str(options.inputPath))
+    print("Input file: " + str(options.inputFile))
+    print("Output path: " + str(options.outputPath))
+
+    filesWritten = 0
+    t0 = time()
+    # TODO: hashGenes is not populated yet; genes are only printed below.
+    hashGenes = {}
+
+    # The quantifier must be INSIDE the group: '([\d])+' would capture only the
+    # last digit of the PMID, producing colliding one-digit file names.
+    rePmid = re.compile(r'(\d+)\|a\|')
+    reGene = re.compile(r'<g>([^<]+)</g>')
+    with open(os.path.join(options.inputPath, options.inputFile), "r", encoding="utf-8", errors="replace") as iFile:
+        print("Reading file..." + options.inputFile)
+        for line in iFile:
+            line = line.strip('\n')
+            for gene in reGene.findall(line):
+                print("genes: {}".format(gene))
+            # match() anchors at the start of the line, so only lines that
+            # begin with "<digits>|a|" produce an output file.
+            result = rePmid.match(line)
+            if result:
+                with open(os.path.join(options.outputPath, result.group(1) + ".txt"), "w", encoding="utf-8", errors="replace") as oFile:
+                    oFile.write(line)
+                filesWritten += 1
+
+    print("Files written: {}".format(filesWritten))
+    print("Processing done in: %fs" % (time() - t0))
+
+
+
--- a/preparing-training-validation-test.py 0 → 100644
View file @8d2b2c6
+++ b/preparing-training-validation-test.py 0 → 100644
View file @8d2b2c6
--- a/tagging_Sklearn_crfsuite.py 0 → 100644
View file @8d2b2c6
+++ b/tagging_Sklearn_crfsuite.py 0 → 100644
View file @8d2b2c6
--- a/training-validation.py 0 → 100644
View file @8d2b2c6
+++ b/training-validation.py 0 → 100644
View file @8d2b2c6