# extract-manually-tagged-gcs.py

# -*- coding: UTF-8 -*-

from optparse import OptionParser
import os
import sys
import re

__author__ = 'CMendezC'

# Objective: extract manually tagged growth conditions.

# Parameters:
#   1) --inputPath input path
#   2) --outputPath output path

# Output:
#   1) Tab separated file

# Execution:
# python extract-manually-tagged-gcs.py
# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\tagged-xml-data
# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\report-manually-tagged-gcs
# c:\anaconda3\python extract-manually-tagged-gcs.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\tagged-xml-data --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\report-manually-tagged-gcs

###########################################################
#                       MAIN PROGRAM                      #
###########################################################

# Patterns used to read the tagged "*_family.xml" files line by line.
_REGEX_TAG_CONTENT = re.compile(r'<(?P<tag>[^>]+)>(?P<content>[^<]+)<')
_REGEX_SERIE = re.compile(r'^\^SERIES = (?P<serie>GSE[0-9]+)$')
_REGEX_SAMPLE = re.compile(r'^\^SAMPLE = (?P<sample>GSM[0-9]+)$')

# Tag columns written to every report, in output order. "Phase" is listed
# once: a second entry would only duplicate the same column.
_REPORT_TAGS = ("Technique", "Orgn", "Strain", "Substrain", "Gversion", "Gtype",
                "Phase", "Air", "Med", "Temp", "Supp")


def _parse_options(argv=None):
    """Parse and validate the command line; return the optparse options.

    argv: list of arguments, or None to use sys.argv[1:].
    Exits through parser.error() (SystemExit, status 2) when positional
    arguments are given or a required option is missing.
    """
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read input files", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to place output files", metavar="PATH")

    (options, args) = parser.parse_args(argv)
    if args:
        # parser.error() prints the usage message and exits by itself.
        parser.error("Unexpected positional arguments: {}".format(' '.join(args)))
    if not options.inputPath:
        parser.error("--inputPath is required.")
    if not options.outputPath:
        parser.error("--outputPath is required.")
    return options


def _collect_tagged_contents(filePath, hashGcs):
    """Read one tagged file and add its contents to hashGcs in place.

    hashGcs maps serie -> sample -> tag -> list of distinct contents, in order
    of first appearance. The current serie and sample start undefined for
    every file, so tagged text found before the first "^SERIES"/"^SAMPLE"
    line is skipped with a warning instead of being attached to a stale
    sample or crashing.
    """
    serie = None
    sample = None
    with open(filePath, "r", encoding="utf-8") as iFile:
        for line in iFile:
            line = line.strip('\n')
            result = _REGEX_SERIE.match(line)
            if result:
                serie = result.group('serie')
                sample = None  # samples never carry over between series
                if serie in hashGcs:
                    print("WARNING! duplicate serie")
                else:
                    hashGcs[serie] = {}
                continue
            result = _REGEX_SAMPLE.match(line)
            if result:
                if serie is None:
                    print("WARNING! sample found before any serie was skipped")
                    continue
                sample = result.group('sample')
                if sample in hashGcs[serie]:
                    print("WARNING! duplicate sample")
                else:
                    hashGcs[serie][sample] = {}
                continue
            for m in _REGEX_TAG_CONTENT.finditer(line):
                if serie is None or sample is None:
                    print("WARNING! tagged content outside of a sample was skipped")
                    break
                tag = m.group('tag')
                content = m.group('content').strip()
                # Contents are printed as encoded bytes so that characters the
                # console cannot represent do not abort the run.
                print("\nSerie: {}\tSample: {}\tTag: {}\tContent: {}".format(serie, sample, tag, content.encode(encoding='utf-8', errors='replace')))
                hashTag = hashGcs[serie][sample]
                if tag not in hashTag:
                    hashTag[tag] = [content]
                    print("New tag: {} and content: {}".format(tag, content.encode(encoding='utf-8', errors='replace')))
                elif content in hashTag[tag]:
                    # GC content already in hash
                    print("Duplicated content: {}".format(content.encode(encoding='utf-8', errors='replace')))
                else:
                    print("New content: {}".format(content))
                    hashTag[tag].append(content)
                    print("hashGcs[serie][sample][tag]: {}".format(hashTag[tag]))


def _write_report(reportPath, hashGcs):
    """Write hashGcs as a tab separated report with one row per sample.

    Every row has the same number of fields as the header; tags without
    content produce empty fields, and several contents for the same tag are
    joined with ', '. Samples are sorted within each serie.
    """
    with open(reportPath, "w", encoding="utf-8") as oFile:
        oFile.write('\t'.join(('Serie', 'Sample') + _REPORT_TAGS) + "\n")
        for serie, hashSample in hashGcs.items():
            print("Serie: {}".format(serie))
            for sample in sorted(hashSample):
                print("\tSample: {}".format(sample))
                hashTag = hashSample[sample]
                row = [serie, sample]
                row.extend(', '.join(hashTag.get(tag, [])) for tag in _REPORT_TAGS)
                oFile.write('\t'.join(row) + "\n")


def main(argv=None):
    """Extract manually tagged growth conditions into tab separated reports.

    Walks --inputPath (sub-directories included), reads every file whose name
    ends with "_family.xml" and writes "<name>.report.csv" into --outputPath,
    which is created when it does not exist.

    argv: list of command-line arguments, or None to use sys.argv[1:].
    """
    options = _parse_options(argv)

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(options.inputPath))
    print("Path to place output files: " + str(options.outputPath))

    os.makedirs(options.outputPath, exist_ok=True)

    # NOTE(review): hashGcs is shared by all files, so each report also lists
    # the series read from previously processed files. This keeps the original
    # behaviour; confirm whether a per-file report was intended.
    hashGcs = {}
    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        dirs.sort()  # visit sub-directories in a reproducible order
        for f in sorted(files):
            if not f.endswith("_family.xml"):
                continue
            print("Processing...{}/{}".format(path, f))
            # Join with the directory being walked, not with the root, so that
            # files inside sub-directories are opened correctly.
            _collect_tagged_contents(os.path.join(path, f), hashGcs)
            reportName = os.path.splitext(f)[0] + ".report.csv"
            _write_report(os.path.join(options.outputPath, reportName), hashGcs)


if __name__ == "__main__":
    main()