Sc

Carlos-Francisco Méndez-Cruz
Commit 7527bf12f270cb74f087e1328ae2283e1804c0de 7527bf12 0 parents
Showing 4 changed files with 252 additions and 0 deletions
.idea/vcs.xml
preprocessing-data/extract-manually-tagged-gcs.py
preprocessing-data/gzip-2-soft.py
preprocessing-data/soft-2-xml.py
--- a/.idea/vcs.xml 0 → 100644
View file @7527bf1
+++ b/.idea/vcs.xml 0 → 100644
View file @7527bf1
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>
\ No newline at end of file
--- a/preprocessing-data/extract-manually-tagged-gcs.py 0 → 100644
View file @7527bf1
+++ b/preprocessing-data/extract-manually-tagged-gcs.py 0 → 100644
View file @7527bf1
+# -*- coding: UTF-8 -*-
+
+from optparse import OptionParser
+import os
+import sys
+import re
+
+__author__ = 'CMendezC'
+
+# Objective: extract manually tagged growth conditions.
+
+# Parameters:
+#   1) --inputPath input path
+#   2) --outputPath output path
+
+# Output:
+#   1) Tab separated file
+
+# Execution:
+# python extract-manually-tagged-gcs.py
+# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\tagged-xml-data
+# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\report-manually-tagged-gcs
+# c:\anaconda3\python extract-manually-tagged-gcs.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\tagged-xml-data --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\report-manually-tagged-gcs
+
+###########################################################
+#                       MAIN PROGRAM                      #
+###########################################################
+
+if __name__ == "__main__":
+    # Entry point: scan --inputPath for "*_family.xml" files, collect the
+    # manually tagged growth conditions (<Tag>content</Tag>) found under each
+    # "^SERIES = GSE..." / "^SAMPLE = GSM..." heading and write one
+    # tab-separated report per input file into --outputPath.
+
+    # Parameter definition
+    parser = OptionParser()
+    parser.add_option("--inputPath", dest="inputPath",
+                      help="Path to read input files", metavar="PATH")
+    parser.add_option("--outputPath", dest="outputPath",
+                      help="Path to place output files", metavar="PATH")
+
+    (options, args) = parser.parse_args()
+    if len(args) > 0:
+        # parser.error() prints the message and terminates the script itself.
+        parser.error("None parameter entered.")
+    if options.inputPath is None or options.outputPath is None:
+        # Both paths are needed to locate the inputs and place the reports.
+        parser.error("--inputPath and --outputPath are required.")
+
+    # Printing parameter values
+    print('-------------------------------- PARAMETERS --------------------------------')
+    print("Path to read input files: " + str(options.inputPath))
+    print("Path to place output files: " + str(options.outputPath))
+
+    # serie -> sample -> tag -> list of distinct contents. Shared by all files
+    # so that a serie appearing more than once is still reported as duplicate.
+    hashGcs = {}
+    regexTagContent = re.compile(r'<(?P<tag>[^>]+)>(?P<content>[^<]+)<')
+    regexSerie = re.compile(r'^\^SERIES = (?P<serie>GSE[0-9]+)$')
+    regexSample = re.compile(r'^\^SAMPLE = (?P<sample>GSM[0-9]+)$')
+    # Report columns, one per tag ("Phase" was previously listed twice, which
+    # produced a duplicated column).
+    tags = ["Technique", "Orgn", "Strain", "Substrain", "Gversion", "Gtype", "Phase", "Air", "Med", "Temp", "Supp"]
+
+    # Walk directory to read files
+    for path, dirs, files in os.walk(options.inputPath):
+        for f in files:
+            if not f.endswith("_family.xml"):
+                continue
+            # Join with the directory reported by os.walk (not the root) so
+            # files found in subdirectories can be opened as well.
+            print("Processing...{}/{}".format(path, f))
+            # Current context; reset per file so nothing leaks between files.
+            serie = None
+            sample = None
+            # Series seen in this file, in order of appearance.
+            fileSeries = []
+            with open(os.path.join(path, f), "r", encoding="utf-8") as iFile:
+                for line in iFile:
+                    line = line.strip('\n')
+                    result = regexSerie.match(line)
+                    if result:
+                        serie = result.group('serie')
+                        # A new serie starts: the previous sample no longer applies.
+                        sample = None
+                        if serie in hashGcs:
+                            print("WARNING! duplicate serie")
+                        else:
+                            hashGcs[serie] = {}
+                        if serie not in fileSeries:
+                            fileSeries.append(serie)
+                        continue
+                    result = regexSample.match(line)
+                    if result:
+                        if serie is None:
+                            print("WARNING! sample {} found before any serie; skipped".format(result.group('sample')))
+                            continue
+                        sample = result.group('sample')
+                        if sample in hashGcs[serie]:
+                            print("WARNING! duplicate sample")
+                        else:
+                            hashGcs[serie][sample] = {}
+                        continue
+                    for m in regexTagContent.finditer(line):
+                        tag = m.group('tag')
+                        content = m.group('content').strip()
+                        if serie is None or sample is None:
+                            # Tagged text outside a sample cannot be attributed.
+                            print("WARNING! tag {} found outside a sample; skipped".format(tag))
+                            continue
+                        print("\nSerie: {}\tSample: {}\tTag: {}\tContent: {}".format(serie, sample, tag, content.encode(encoding='utf-8', errors='replace')))
+                        hashTag = hashGcs[serie][sample]
+                        if tag in hashTag:
+                            if content in hashTag[tag]:
+                                # GC content already in hash
+                                print("Duplicated content: {}".format(content.encode(encoding='utf-8', errors='replace')))
+                            else:
+                                print("New content: {}".format(content))
+                                hashTag[tag].append(content)
+                                print("hashGcs[serie][sample][tag]: {}".format(hashTag[tag]))
+                        else:
+                            hashTag[tag] = [content]
+                            print("New tag: {} and content: {}".format(tag, content.encode(encoding='utf-8', errors='replace')))
+
+            # One report per input file, restricted to the series of that file.
+            with open(os.path.join(options.outputPath, f.replace(".xml", ".report.csv")), "w", encoding="utf-8") as oFile:
+                oFile.write('\t'.join(['Serie', 'Sample'] + tags) + "\n")
+                for serie in fileSeries:
+                    print("Serie: {}".format(serie))
+                    for sample, hashTag in sorted(hashGcs[serie].items()):
+                        print("\tSample: {}".format(sample))
+                        # Missing tags become empty cells; joining (instead of
+                        # stripping trailing tabs) keeps every row the same width.
+                        pTags = [', '.join(hashTag.get(tag, [])) for tag in tags]
+                        oFile.write('\t'.join([serie, sample] + pTags) + "\n")
+
+
+
--- a/preprocessing-data/gzip-2-soft.py 0 → 100644
View file @7527bf1
+++ b/preprocessing-data/gzip-2-soft.py 0 → 100644
View file @7527bf1
+# -*- coding: UTF-8 -*-
+
+from optparse import OptionParser
+import os
+import sys
+import gzip
+import shutil
+
+__author__ = 'CMendezC'
+
+# Objective: uncompress gzip soft file to text soft file
+
+# Parameters:
+#   1) --inputPath input path
+#   2) --outputPath output path
+
+# Output:
+#   1) Text soft file
+
+# Execution:
+# python gzip-2-soft.py
+# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\gzip-data
+# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data
+# python gzip-2-soft.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\gzip-data --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data
+
+###########################################################
+#                       MAIN PROGRAM                      #
+###########################################################
+
+if __name__ == "__main__":
+    # Entry point: decompress every "*.gz" file found under --inputPath into
+    # a plain-text file placed in --outputPath.
+
+    # Local import: zlib.error is what a corrupt compressed stream raises.
+    import zlib
+
+    # Parameter definition
+    parser = OptionParser()
+    parser.add_option("--inputPath", dest="inputPath",
+                      help="Path to read input files", metavar="PATH")
+    parser.add_option("--outputPath", dest="outputPath",
+                      help="Path to place output files", metavar="PATH")
+
+    (options, args) = parser.parse_args()
+    if len(args) > 0:
+        # parser.error() prints the message and terminates the script itself.
+        parser.error("None parameter entered.")
+    if options.inputPath is None or options.outputPath is None:
+        # Both paths are needed to locate the inputs and place the outputs.
+        parser.error("--inputPath and --outputPath are required.")
+
+    # Printing parameter values
+    print('-------------------------------- PARAMETERS --------------------------------')
+    print("Path to read input files: " + str(options.inputPath))
+    print("Path to place output files: " + str(options.outputPath))
+
+    # Walk directory to read files
+    for path, dirs, files in os.walk(options.inputPath):
+        for f in files:
+            if not f.endswith(".gz"):
+                continue
+            print("Processing...{}/{}".format(path, f))
+            # "X.soft.gz" -> "X.txt"; any other "Y.gz" -> "Y.txt", so the
+            # uncompressed output never keeps a misleading ".gz" name.
+            if f.endswith('.soft.gz'):
+                outputName = f[:-len('.soft.gz')] + '.txt'
+            else:
+                outputName = f[:-len('.gz')] + '.txt'
+            try:
+                # Join with the directory reported by os.walk (not the root)
+                # so files found in subdirectories can be opened as well.
+                with gzip.open(os.path.join(path, f), 'rb') as f_in:
+                    with open(os.path.join(options.outputPath, outputName), 'wb') as f_out:
+                        shutil.copyfileobj(f_in, f_out)
+            except (OSError, EOFError, zlib.error) as err:
+                # Best effort: keep going with the remaining files, but report
+                # the failure instead of hiding it.
+                print("WARNING! could not uncompress {}: {}".format(os.path.join(path, f), err))
--- a/preprocessing-data/soft-2-xml.py 0 → 100644
View file @7527bf1
+++ b/preprocessing-data/soft-2-xml.py 0 → 100644
View file @7527bf1
+# -*- coding: UTF-8 -*-
+
+from optparse import OptionParser
+import os
+import sys
+
+__author__ = 'CMendezC'
+
+# Objective: convert soft file to XML file:
+#   include headings, tags, substitute &
+
+# Parameters:
+#   1) --inputPath input path
+#   2) --outputPath output path
+
+# Output:
+#   1) XML File with soft file content
+
+# Execution:
+# python soft-2-xml.py
+# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data
+# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\xml-data
+# python soft-2-xml.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\xml-data
+
+###########################################################
+#                       MAIN PROGRAM                      #
+###########################################################
+
+if __name__ == "__main__":
+    # Entry point: wrap every "*_family.txt" soft file found under
+    # --inputPath in a <gse> root element, replacing "&" with "&amp;", and
+    # write the result as an ".xml" file into --outputPath.
+
+    # Parameter definition
+    parser = OptionParser()
+    parser.add_option("--inputPath", dest="inputPath",
+                      help="Path to read input files", metavar="PATH")
+    parser.add_option("--outputPath", dest="outputPath",
+                      help="Path to place output files", metavar="PATH")
+
+    (options, args) = parser.parse_args()
+    if len(args) > 0:
+        # parser.error() prints the message and terminates the script itself.
+        parser.error("None parameter entered.")
+    if options.inputPath is None or options.outputPath is None:
+        # Both paths are needed to locate the inputs and place the outputs.
+        parser.error("--inputPath and --outputPath are required.")
+
+    # Printing parameter values
+    print('-------------------------------- PARAMETERS --------------------------------')
+    print("Path to read input files: " + str(options.inputPath))
+    print("Path to place output files: " + str(options.outputPath))
+
+    # Walk directory to read files
+    for path, dirs, files in os.walk(options.inputPath):
+        for f in files:
+            if not f.endswith("_family.txt"):
+                continue
+            print("Processing...{}/{}".format(path, f))
+            # Join with the directory reported by os.walk (not the root) so
+            # files found in subdirectories can be opened as well.
+            inputFile = os.path.join(path, f)
+            outputFile = os.path.join(options.outputPath, f.replace(".txt", ".xml"))
+            with open(inputFile, "r", encoding="utf-8") as iFile, \
+                    open(outputFile, "w", encoding="utf-8") as oFile:
+                oFile.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n\n<gse xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\nxsi:noNamespaceSchemaLocation=\"esquema-gcs.xsd\">\n\n")
+                for line in iFile:
+                    # NOTE(review): only "&" is escaped; a literal "<" or ">"
+                    # in the soft text would still yield malformed XML -
+                    # confirm whether the inputs can contain them.
+                    oFile.write(line.replace("&", "&amp;"))
+                oFile.write("\n</gse>\n")
+