Estefani Gaytan Nunez

organizacion nueva

Showing 53 changed files with 0 additions and 364 deletions
This diff could not be displayed because it is too large.
This diff could not be displayed because it is too large.
# -*- coding: UTF-8 -*-
from optparse import OptionParser
import os
import sys
import re
__author__ = 'CMendezC'
# Objective: extract manually tagged growth conditions.
# Parameters:
# 1) --inputPath input path
# 2) --outputPath output path
# Output:
# 1) Tab separated file
# Execution:
# python extract-manually-tagged-gcs.py
# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\tagged-xml-data
# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\report-manually-tagged-gcs
# c:\anaconda3\python extract-manually-tagged-gcs.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\tagged-xml-data --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\report-manually-tagged-gcs
# python extract-manually-tagged-gcs.py
# --inputPath "C:\Users\cmendezc\Dropbox (UNAM-CCG)\PGC-BC\Proyectos\O9-NLP\Growth Conditions_HT\etiquetado-manual-gcs"
# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\report-manually-tagged-gcs
# python extract-manually-tagged-gcs.py --inputPath "C:\Users\cmendezc\Dropbox (UNAM-CCG)\PGC-BC\Proyectos\O9-NLP\Growth Conditions_HT\etiquetado-manual-gcs" --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\report-manually-tagged-gcs
###########################################################
# MAIN PROGRAM #
###########################################################
# Header lines that open a series / sample section in a SOFT family file.
REGEX_SERIE = re.compile(r'^\^SERIES = (?P<serie>GSE[0-9]+)$')
REGEX_SAMPLE = re.compile(r'^\^SAMPLE = (?P<sample>GSM[0-9]+)$')
# A manually inserted tag followed by its text, e.g. "<Med>LB</Med>".
REGEX_TAG_CONTENT = re.compile(r'<(?P<tag>[^>]+)>(?P<content>[^<]+)<')
# Tags from esquema-gcs.xsd at 11/09/2018
TAGS = ["Name", "Anti", "Orgn", "Strain", "Substrain", "Gtype", "Gversion",
        "Med", "Technique", "Supp", "Air", "Temp", "pH", "Press", "OD",
        "Phase", "Rate", "Vess", "Agit"]


def unescape_xml(text):
    """Decode the five predefined XML entities in *text*.

    "&amp;" is decoded last: decoding it first would turn an escaped
    literal such as "&amp;lt;" into "&lt;" and then, wrongly, into "<".
    """
    for entity, char in (("&lt;", "<"), ("&gt;", ">"),
                         ("&quot;", "\""), ("&apos;", "'")):
        text = text.replace(entity, char)
    return text.replace("&amp;", "&")


def csv_line(fields):
    """Return *fields* as one CSV line with every field double-quoted.

    Embedded double quotes are doubled so that the line stays parseable.
    """
    return ','.join('"' + field.replace('"', '""') + '"' for field in fields)


def parse_family_lines(lines):
    """Collect tagged growth conditions from the lines of one family file.

    Returns a dict ``{serie: {sample: {tag: [content, ...]}}}``. Contents
    are stripped, XML-unescaped and de-duplicated per tag, keeping the
    order of first appearance. Tagged text found before the first
    ``^SAMPLE`` header of a series is ignored because there is no sample
    to attach it to.
    """
    gcs = {}
    serie = None
    sample = None
    for line in lines:
        line = line.strip('\n')
        match = REGEX_SERIE.match(line)
        if match:
            serie = match.group('serie')
            # A new series starts without a current sample.
            sample = None
            if serie in gcs:
                print("WARNING! duplicate serie")
            else:
                gcs[serie] = {}
            continue
        match = REGEX_SAMPLE.match(line)
        if match:
            if serie is None:
                print("WARNING! sample found before any serie")
                continue
            sample = match.group('sample')
            if sample in gcs[serie]:
                print("WARNING! duplicate sample")
            else:
                gcs[serie][sample] = {}
            continue
        if serie is None or sample is None:
            continue
        for match in REGEX_TAG_CONTENT.finditer(line):
            content = unescape_xml(match.group('content').strip())
            contents = gcs[serie][sample].setdefault(match.group('tag'), [])
            if content not in contents:
                contents.append(content)
    return gcs


def report_rows(gcs, tags):
    """Return one CSV line per sample of *gcs*, with one column per tag.

    Samples are sorted within each series; several contents of the same
    tag are joined with ", "; tags without content give an empty column.
    """
    rows = []
    for serie, samples in gcs.items():
        print("Serie: {}".format(serie))
        for sample, tag_contents in sorted(samples.items()):
            print("\tSample: {}".format(sample))
            fields = [serie, sample]
            fields.extend(', '.join(tag_contents.get(tag, [])) for tag in tags)
            rows.append(csv_line(fields))
    return rows


def main_extract_tagged_gcs(argv=None):
    """Write a CSV report per ``*_family.xml`` file plus a combined report.

    *argv* is the list of command-line arguments (defaults to
    ``sys.argv[1:]``). Each per-file report holds only the series read
    from that file; ``GSE_family.complete-report.csv`` holds the header
    once followed by the rows of every file.
    """
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read input files", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to place output files", metavar="PATH")
    (options, args) = parser.parse_args(argv)
    # parser.error() prints the message and exits by itself.
    if args:
        parser.error("Unexpected positional arguments: {}".format(" ".join(args)))
    if not options.inputPath or not options.outputPath:
        parser.error("Both --inputPath and --outputPath are required.")

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(options.inputPath))
    print("Path to place output files: " + str(options.outputPath))

    header = csv_line(["Serie", "Sample"] + TAGS)
    complete_report = [header]
    seen_series = set()
    processed_files = 0
    saved_files = 0
    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        for f in files:
            if not f.endswith("_family.xml"):
                continue
            print("Processing...{} {}".format(path, f))
            # Join with the directory being walked so nested files are found.
            with open(os.path.join(path, f), "r", encoding="utf-8", errors='replace') as iFile:
                gcs = parse_family_lines(iFile)
            processed_files += 1
            for serie in gcs:
                if serie in seen_series:
                    print("WARNING! duplicate serie")
                seen_series.add(serie)
            rows = report_rows(gcs, TAGS)
            report_name = f.replace(".xml", ".report.csv")
            with open(os.path.join(options.outputPath, report_name), "w", encoding="utf-8") as oFile:
                oFile.write(header + "\n")
                for row in rows:
                    oFile.write(row + "\n")
            saved_files += 1
            complete_report.extend(rows)
    with open(os.path.join(options.outputPath, "GSE_family.complete-report.csv"), "w", encoding="utf-8") as oFile:
        for line in complete_report:
            oFile.write(line + "\n")
    print("Processed files: {}".format(processed_files))
    print("Saved files: {}".format(saved_files))


if __name__ == "__main__":
    main_extract_tagged_gcs()
# -*- coding: UTF-8 -*-
import os
import sys
import argparse
import re
import numpy as np
from datetime import *
__author__ = 'KevinML'
# Objective: obtain the metadata key and the content of every line that contains <Tag> markup within a file.
# Parameters:
# 1) --inputPath input path
# 2) --outputPath output path
# Output:
# 1) exit_file.xml: tab-separated file name, metadata key and content
# Execution:
#Example 1
#python3 recorrer_archivos_o.py --inputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/tagged-xml-data/
#--outputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/output-kevin/
#Example 2
#python3 /home/kevinml/automatic-extraction-growth-conditions/scripts/recorrer_archivos_o.py
#--inputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/tagged-xml-data/
#--outputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/output-kevin/
###########################################################
# MAIN PROGRAM #
###########################################################
# Any opening tag made only of letters, e.g. "<Med>".
REGEX_TAG = re.compile(r'<[A-Za-z]+>')
# Name of the report written inside --outputPath.
EXIT_FILE = r"exit_file.xml"


def collect_tagged_metadata(lines):
    """Group the tagged lines of one file by their metadata key.

    A line is kept when it contains an opening tag and has the form
    ``key = content``. Returns ``{key: [content, ...]}`` in order of
    appearance. The split is done on the first " = " only, so content that
    itself contains " = " is kept whole; tagged lines without " = " are
    skipped instead of raising IndexError.
    """
    metadatos = {}
    for line in lines:
        line = line.strip('\n')
        if not REGEX_TAG.search(line):
            continue
        key, separator, content = line.partition(" = ")
        if not separator:
            continue
        metadatos.setdefault(key, []).append(content)
    return metadatos


def main_collect_tagged_lines(argv=None):
    """Scan every file under --inputPath and write the tagged lines report.

    *argv* is the list of command-line arguments (defaults to
    ``sys.argv[1:]``). The report has one ``file<TAB>key<TAB>content`` row
    per tagged line, with files and keys sorted alphabetically.
    """
    parser = argparse.ArgumentParser(description='Obtenecion de metadatos y del contenido de de lineas con <Tags/>',
                                     epilog= 'Bien Hecho!')
    parser.add_argument('--inputPath', dest='inputPath', metavar='PATH', required = True,
                        help='Ingrese el archivo de entrada.')
    parser.add_argument('--outputPath', dest='outputPath', metavar='PATH', required = True,
                        help='Ingrese el archivo de salida.')
    args = parser.parse_args(argv)

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(args.inputPath))
    print("Path to place output files: " + str(args.outputPath))

    # Metadata of every file, keyed by file name.
    archivo = {}
    for path, dirs, files in os.walk(args.inputPath):
        for f in files:
            # Join with the directory being walked so nested files are found.
            with open(os.path.join(path, f), mode ='r', encoding ="utf-8", errors="replace") as iFile:
                archivo[f] = collect_tagged_metadata(iFile)

    with open(os.path.join(args.outputPath, EXIT_FILE), mode = "w", encoding="utf-8") as oFile:
        oFile.write('#Fecha:{}\t\t\n#Archivo\tMetadato\tContenido\n\n'.format(datetime.today()))
        for arch in sorted(archivo):
            # Use the metadata of this file, not the last one that was read.
            for k, v in sorted(archivo[arch].items()):
                for x in v:
                    oFile.write('{}\t{}\t{}\n'.format(arch, k, x))


if __name__ == "__main__":
    main_collect_tagged_lines()
# -*- coding: UTF-8 -*-
from optparse import OptionParser
import os
import sys
import gzip
import shutil
__author__ = 'CMendezC'
# Objective: uncompress gzip soft file to text soft file
# Parameters:
# 1) --inputPath input path
# 2) --outputPath output path
# Output:
# 1) Text soft file
# Execution:
# python gzip-2-soft.py
# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\gzip-data
# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data
# python gzip-2-soft.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\gzip-data --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data
###########################################################
# MAIN PROGRAM #
###########################################################
def gunzip_file(src_path, dst_path):
    """Decompress the gzip file *src_path* into *dst_path*.

    If decompression fails, the partially written destination is removed
    and the exception is re-raised, so that no truncated file is left
    looking like a valid result.
    """
    with gzip.open(src_path, 'rb') as f_in:
        f_out = open(dst_path, 'wb')
        try:
            with f_out:
                shutil.copyfileobj(f_in, f_out)
        except Exception:
            # f_out is already closed here, so it can be deleted safely.
            os.remove(dst_path)
            raise


def main_gzip_to_soft(argv=None):
    """Decompress every ``.gz`` file under --inputPath into --outputPath.

    *argv* is the list of command-line arguments (defaults to
    ``sys.argv[1:]``). A file that cannot be decompressed is reported on
    stderr and skipped; the remaining files are still processed.
    """
    import zlib  # only needed to recognise corrupt-stream errors

    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read input files", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to place output files", metavar="PATH")
    (options, args) = parser.parse_args(argv)
    # parser.error() prints the message and exits by itself.
    if args:
        parser.error("Unexpected positional arguments: {}".format(" ".join(args)))
    if not options.inputPath or not options.outputPath:
        parser.error("Both --inputPath and --outputPath are required.")

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(options.inputPath))
    print("Path to place output files: " + str(options.outputPath))

    # Walk directory to read files
    for path, dirs, files in os.walk(options.inputPath):
        for f in files:
            if not f.endswith(".gz"):
                continue
            # Join with the directory being walked so nested files are found.
            src = os.path.join(path, f)
            dst = os.path.join(options.outputPath, f.replace('.soft.gz', '.txt'))
            print("Processing...{}/{}".format(path, f))
            if os.path.abspath(src) == os.path.abspath(dst):
                # Opening the destination would truncate the source.
                print("WARNING! skipping {}: output would overwrite input".format(src),
                      file=sys.stderr)
                continue
            try:
                gunzip_file(src, dst)
            except (OSError, EOFError, zlib.error) as err:
                print("WARNING! could not decompress {}: {}".format(src, err),
                      file=sys.stderr)


if __name__ == "__main__":
    main_gzip_to_soft()
This diff is collapsed. Click to expand it.
# -*- coding: UTF-8 -*-
from optparse import OptionParser
import os
import sys
__author__ = 'CMendezC'
# Objective: convert soft file to XML file:
# include headings, tags, substitute & and <
# Parameters:
# 1) --inputPath input path
# 2) --outputPath output path
# Output:
# 1) XML File with soft file content
# Execution:
# python soft-2-xml.py
# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data-additional
# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\xml-data
# python soft-2-xml.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\xml-data
# Additional files
# python soft-2-xml.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data-additional --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\xml-data
###########################################################
# MAIN PROGRAM #
###########################################################
# Prolog and root element written before the SOFT content.
XML_HEADER = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n\n<gse xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\nxsi:noNamespaceSchemaLocation=\"esquema-gcs.xsd\">\n\n"
# Closing of the root element written after the SOFT content.
XML_FOOTER = "\n</gse>\n"


def escape_soft_line(line):
    """Escape "&" and "<" in *line* so it can be used as XML text.

    "&" is replaced first; otherwise the "&" of a freshly produced "&lt;"
    would be escaped again.
    """
    return line.replace("&", "&amp;").replace("<", "&lt;")


def soft_to_xml(iFile, oFile):
    """Copy the lines of *iFile* to *oFile*, escaped, inside a <gse> root."""
    oFile.write(XML_HEADER)
    for line in iFile:
        oFile.write(escape_soft_line(line))
    oFile.write(XML_FOOTER)


def main_soft_to_xml(argv=None):
    """Convert every ``*_family.soft`` file under --inputPath to XML.

    *argv* is the list of command-line arguments (defaults to
    ``sys.argv[1:]``). Each output file keeps the input name with ".soft"
    replaced by ".xml" and is written into --outputPath.
    """
    # Parameter definition
    parser = OptionParser()
    parser.add_option("--inputPath", dest="inputPath",
                      help="Path to read input files", metavar="PATH")
    parser.add_option("--outputPath", dest="outputPath",
                      help="Path to place output files", metavar="PATH")
    (options, args) = parser.parse_args(argv)
    # parser.error() prints the message and exits by itself.
    if args:
        parser.error("Unexpected positional arguments: {}".format(" ".join(args)))
    if not options.inputPath or not options.outputPath:
        parser.error("Both --inputPath and --outputPath are required.")

    # Printing parameter values
    print('-------------------------------- PARAMETERS --------------------------------')
    print("Path to read input files: " + str(options.inputPath))
    print("Path to place output files: " + str(options.outputPath))

    # Walk directory to read files
    processedFiles = 0
    for path, dirs, files in os.walk(options.inputPath):
        for f in files:
            if not f.endswith("_family.soft"):
                continue
            print("Processing...{}/{}".format(path, f))
            # Join with the directory being walked so nested files are found.
            source = os.path.join(path, f)
            target = os.path.join(options.outputPath, f.replace(".soft", ".xml"))
            with open(source, "r", encoding="utf-8", errors="replace") as iFile:
                with open(target, "w", encoding="utf-8") as oFile:
                    soft_to_xml(iFile, oFile)
            processedFiles += 1
    print("Processed files: {}".format(processedFiles))


if __name__ == "__main__":
    main_soft_to_xml()