# extract-manually-tagged-gcs.py 8.74 KB
# -*- coding: UTF-8 -*-

from optparse import OptionParser
import os
import sys
import re

__author__ = 'CMendezC'

# Objective: extract manually tagged growth conditions.

# Parameters:
#   1) --inputPath input path
#   2) --outputPath output path

# Output:
#   1) One comma-separated file (*.report.csv) per input file,
#      plus a combined GSE_family.complete-report.csv

# Execution:
# python extract-manually-tagged-gcs.py
# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\tagged-xml-data
# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\report-manually-tagged-gcs
# c:\anaconda3\python extract-manually-tagged-gcs.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\tagged-xml-data --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\report-manually-tagged-gcs

# python extract-manually-tagged-gcs.py
# --inputPath "C:\Users\cmendezc\Dropbox (UNAM-CCG)\PGC-BC\Proyectos\O9-NLP\Growth Conditions_HT\etiquetado-manual-gcs"
# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\report-manually-tagged-gcs
# python extract-manually-tagged-gcs.py --inputPath "C:\Users\cmendezc\Dropbox (UNAM-CCG)\PGC-BC\Proyectos\O9-NLP\Growth Conditions_HT\etiquetado-manual-gcs" --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\report-manually-tagged-gcs

###########################################################
#                    HELPER FUNCTIONS                     #
###########################################################

# Matches one '<tag>content<' occurrence; the final '<' is the start of
# whatever tag follows the content (normally the closing tag).
REGEX_TAG_CONTENT = re.compile(r'<(?P<tag>[^>]+)>(?P<content>[^<]+)<')
# Lines that open a new serie / sample section in the tagged files.
REGEX_SERIE = re.compile(r'^\^SERIES = (?P<serie>GSE[0-9]+)$')
REGEX_SAMPLE = re.compile(r'^\^SAMPLE = (?P<sample>GSM[0-9]+)$')
# Tags from esquema-gcs.xsd at 11/09/2018; they define the report columns.
TAGS = ["Name", "Anti", "Orgn", "Strain", "Substrain", "Gtype", "Gversion",
        "Med", "Technique", "Supp", "Air", "Temp", "pH", "Press", "OD",
        "Phase", "Rate", "Vess", "Agit"]


def unescape_xml_entities(text):
    """Return *text* with the five predefined XML entities decoded.

    ``&amp;`` is decoded last so that an escaped ampersand followed by
    entity-like text (``&amp;lt;``) is decoded exactly once (to ``&lt;``)
    instead of twice (to ``<``).
    """
    text = text.replace("&lt;", "<")
    text = text.replace("&gt;", ">")
    text = text.replace("&quot;", "\"")
    text = text.replace("&apos;", "'")
    return text.replace("&amp;", "&")


def format_csv_row(fields):
    """Return *fields* as a single CSV line (without line terminator).

    Every field is wrapped in double quotes; double quotes inside a field
    are doubled so that the line stays parseable by CSV readers.
    """
    return ','.join('"{}"'.format(str(field).replace('"', '""'))
                    for field in fields)


def parse_family_file(file_path, gcs):
    """Collect the tagged growth conditions of one file into *gcs*.

    Parameters:
        file_path: path of the tagged file to read.
        gcs: dict ``serie -> sample -> tag -> [contents]``; updated in place.
            A content string is stored once per tag, in order of appearance.

    Returns:
        List of the series found in this file, in order of first appearance.
    """
    series_in_file = []
    # Reset per file so nothing is attributed to a serie/sample of a
    # previously parsed file.
    serie = None
    sample = None
    # errors='replace' keeps reading when a file contains bytes that are
    # invalid in the default encoding.
    with open(file_path, "r", errors='replace') as iFile:
        for line in iFile:
            line = line.strip('\n')
            result = REGEX_SERIE.match(line)
            if result:
                serie = result.group('serie')
                sample = None  # a new serie has no current sample yet
                if serie in gcs:
                    print("WARNING! duplicate serie")
                else:
                    gcs[serie] = {}
                if serie not in series_in_file:
                    series_in_file.append(serie)
                continue
            result = REGEX_SAMPLE.match(line)
            if result:
                if serie is None:
                    print("WARNING! sample found before any serie")
                    continue
                sample = result.group('sample')
                if sample in gcs[serie]:
                    print("WARNING! duplicate sample")
                else:
                    gcs[serie][sample] = {}
                continue
            if sample is None:
                # Tagged text outside any sample cannot be reported.
                continue
            sample_tags = gcs[serie][sample]
            for m in REGEX_TAG_CONTENT.finditer(line):
                content = unescape_xml_entities(m.group('content').strip())
                contents = sample_tags.setdefault(m.group('tag'), [])
                if content not in contents:
                    contents.append(content)
    return series_in_file


def build_report_rows(gcs, series, verbose=False):
    """Return one CSV line per sample of the given *series*.

    Parameters:
        gcs: dict ``serie -> sample -> tag -> [contents]``.
        series: iterable of serie identifiers (keys of *gcs*) to report.
        verbose: when true, print each serie and sample as it is reported.

    Returns:
        List of CSV lines. Columns are serie, sample and one column per
        entry of TAGS; multiple contents of a tag are joined with ', '.
        Samples are sorted within each serie.
    """
    rows = []
    for serie in series:
        if verbose:
            print("Serie: {}".format(serie))
        samples = gcs[serie]
        for sample in sorted(samples):
            if verbose:
                print("\tSample: {}".format(sample))
            sample_tags = samples[sample]
            fields = [serie, sample]
            fields.extend(', '.join(sample_tags.get(tag, [])) for tag in TAGS)
            rows.append(format_csv_row(fields))
    return rows


###########################################################
#                       MAIN PROGRAM                      #
###########################################################

if __name__ == "__main__":
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read input files", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to place output files", metavar="PATH")

    (options, args) = parser.parse_args()
    # parser.error() prints the usage message and terminates the program.
    if len(args) > 0:
        parser.error("Unexpected positional arguments: {}".format(' '.join(args)))
    if not options.inputPath or not options.outputPath:
        parser.error("Both --inputPath and --outputPath are required.")

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(options.inputPath))
    print("Path to place output files: " + str(options.outputPath))

    # serie -> sample -> tag -> [contents], accumulated over all input files
    hashGcs = {}
    header = format_csv_row(["Serie", "Sample"] + TAGS)
    processed_files = 0
    saved_files = 0
    # Walk directory to read files
    for path, _dirs, files in os.walk(options.inputPath):
        for f in files:
            if not f.endswith("_family.xml"):
                continue
            print("Processing...{} {}".format(path, f))
            # Join with the directory os.walk is currently visiting so that
            # files in subdirectories are opened from where they really are.
            series_in_file = parse_family_file(os.path.join(path, f), hashGcs)
            processed_files += 1
            # The per-file report contains only the series found in that file.
            report_name = f.replace(".xml", ".report.csv")
            with open(os.path.join(options.outputPath, report_name), "w") as oFile:
                oFile.write(header + "\n")
                for row in build_report_rows(hashGcs, series_in_file, verbose=True):
                    oFile.write(row + "\n")
            saved_files += 1

    # Combined report: one header followed by every serie/sample exactly once.
    with open(os.path.join(options.outputPath, "GSE_family.complete-report.csv"), "w") as oFile:
        oFile.write(header + "\n")
        for row in build_report_rows(hashGcs, list(hashGcs)):
            oFile.write(row + "\n")

    print("Processed files: {}".format(processed_files))
    print("Saved files: {}".format(saved_files))