Showing
53 changed files
with
0 additions
and
364 deletions
No preview for this file type
GROWTH-CONDITIONS-GEO-EXTRACTION/CoreNLP/bin/get-raw-sentences.sh
→
CoreNLP/bin/get-raw-sentences.sh
File moved
data-sets/file_output/exit_file.txt
deleted
100644 → 0
This diff could not be displayed because it is too large.
data-sets/file_output/exit_file.xml
deleted
100644 → 0
This diff could not be displayed because it is too large.
data-sets/gzip-data/borrame.txt
deleted
100644 → 0
File mode changed
| 1 | -# -*- coding: UTF-8 -*- | ||
| 2 | - | ||
| 3 | -from optparse import OptionParser | ||
| 4 | -import os | ||
| 5 | -import sys | ||
| 6 | -import re | ||
| 7 | - | ||
| 8 | -__author__ = 'CMendezC' | ||
| 9 | - | ||
| 10 | -# Objective: extract manually tagged growth conditions. | ||
| 11 | - | ||
| 12 | -# Parameters: | ||
| 13 | -# 1) --inputPath input path | ||
| 14 | -# 2) --outputPath output path | ||
| 15 | - | ||
| 16 | -# Output: | ||
| 17 | -# 1) Tab separated file | ||
| 18 | - | ||
| 19 | -# Execution: | ||
| 20 | -# python extract-manually-tagged-gcs.py | ||
| 21 | -# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\tagged-xml-data | ||
| 22 | -# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\report-manually-tagged-gcs | ||
| 23 | -# c:\anaconda3\python extract-manually-tagged-gcs.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\tagged-xml-data --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\report-manually-tagged-gcs | ||
| 24 | - | ||
| 25 | -# python extract-manually-tagged-gcs.py | ||
| 26 | -# --inputPath "C:\Users\cmendezc\Dropbox (UNAM-CCG)\PGC-BC\Proyectos\O9-NLP\Growth Conditions_HT\etiquetado-manual-gcs" | ||
| 27 | -# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\report-manually-tagged-gcs | ||
| 28 | -# python extract-manually-tagged-gcs.py --inputPath "C:\Users\cmendezc\Dropbox (UNAM-CCG)\PGC-BC\Proyectos\O9-NLP\Growth Conditions_HT\etiquetado-manual-gcs" --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\report-manually-tagged-gcs | ||
| 29 | - | ||
| 30 | -########################################################### | ||
| 31 | -# MAIN PROGRAM # | ||
| 32 | -########################################################### | ||
| 33 | - | ||
| 34 | -if __name__ == "__main__": | ||
| 35 | - # Parameter definition | ||
| 36 | - parser = OptionParser() | ||
| 37 | - parser.add_option("--inputPath", dest="inputPath", | ||
| 38 | - help="Path to read input files", metavar="PATH") | ||
| 39 | - parser.add_option("--outputPath", dest="outputPath", | ||
| 40 | - help="Path to place output files", metavar="PATH") | ||
| 41 | - | ||
| 42 | - (options, args) = parser.parse_args() | ||
| 43 | - if len(args) > 0: | ||
| 44 | - parser.error("None parameter entered.") | ||
| 45 | - sys.exit(1) | ||
| 46 | - | ||
| 47 | - # Printing parameter values | ||
| 48 | - print('-------------------------------- PARAMETERS --------------------------------') | ||
| 49 | - print("Path to read input files: " + str(options.inputPath)) | ||
| 50 | - print("Path to place output files: " + str(options.outputPath)) | ||
| 51 | - | ||
| 52 | - hashGcs = {} | ||
| 53 | - regexTagContent = re.compile(r'<(?P<tag>[^>]+)>(?P<content>[^<]+)<') | ||
| 54 | - regexSerie = re.compile(r'^\^SERIES = (?P<serie>GSE[0-9]+)$') | ||
| 55 | - regexSample = re.compile(r'^\^SAMPLE = (?P<sample>GSM[0-9]+)$') | ||
| 56 | - # Tags from esquema-gcs.xsd at 11/09/2018 | ||
| 57 | - tags = ["Name", "Anti", "Orgn", "Strain", "Substrain", "Gtype", "Gversion", "Med", "Technique", "Supp", "Air", "Temp", "pH", "Press", "OD", "Phase", "Rate", "Vess", "Agit", ] | ||
| 58 | - processed_files = 0 | ||
| 59 | - saved_files = 0 | ||
| 60 | - complete_report = [] | ||
| 61 | - # Walk directory to read files | ||
| 62 | - for path, dirs, files in os.walk(options.inputPath): | ||
| 63 | - for f in files: | ||
| 64 | - if f.endswith("_family.xml"): | ||
| 65 | - print("Processing...{} {}".format(options.inputPath, f)) | ||
| 66 | - #with open(os.path.join(options.inputPath, f), "r", encoding="utf-8") as iFile: | ||
| 67 | - with open(os.path.join(options.inputPath, f), "r", errors='replace') as iFile: | ||
| 68 | - # numline = 0 | ||
| 69 | - for line in iFile: | ||
| 70 | - # numline+=1 | ||
| 71 | - # if f.find("GSE41195") > -1: | ||
| 72 | - # print(numline) | ||
| 73 | - line = line.strip('\n') | ||
| 74 | - result = regexSerie.match(line) | ||
| 75 | - if result: | ||
| 76 | - serie = result.group('serie') | ||
| 77 | - if serie in hashGcs: | ||
| 78 | - print("WARNING! duplicate serie") | ||
| 79 | - else: | ||
| 80 | - hashGcs[serie] = {} | ||
| 81 | - continue | ||
| 82 | - result = regexSample.match(line) | ||
| 83 | - if result: | ||
| 84 | - sample = result.group('sample') | ||
| 85 | - if sample in hashGcs[serie]: | ||
| 86 | - print("WARNING! duplicate sample") | ||
| 87 | - else: | ||
| 88 | - hashGcs[serie][sample] = {} | ||
| 89 | - # hashGcs[serie] = hashSample | ||
| 90 | - #prevSample = sample | ||
| 91 | - continue | ||
| 92 | - result = regexTagContent.finditer(line) | ||
| 93 | - for m in result: | ||
| 94 | - tag = m.group('tag') | ||
| 95 | - content = m.group('content') | ||
| 96 | - content = content.strip() | ||
| 97 | - content = content.replace("&amp;", "&") | ||
| 98 | - content = content.replace("&lt;", "<") | ||
| 99 | - content = content.replace("&gt;", ">") | ||
| 100 | - content = content.replace("&quot;", "\"") | ||
| 101 | - content = content.replace("&apos;", "\'") | ||
| 102 | - #print("\nSerie: {}\tSample: {}\tTag: {}\tContent: {}".format(serie, sample, tag, content.encode(encoding='utf-8', errors='replace'))) | ||
| 103 | - if tag in hashGcs[serie][sample]: | ||
| 104 | - if content in hashGcs[serie][sample][tag]: | ||
| 105 | - #print("Duplicated content: {}".format(content.encode(encoding='utf-8', errors='replace'))) | ||
| 106 | - pass # GC content already in hash | ||
| 107 | - else: | ||
| 108 | - # print("New content: {}".format(content)) | ||
| 109 | - hashGcs[serie][sample][tag].append(content) | ||
| 110 | - # print("hashGcs[serie][sample][tag]: {}".format(hashGcs[serie][sample][tag])) | ||
| 111 | - else: | ||
| 112 | - hashGcs[serie][sample][tag] = [content] | ||
| 113 | - #print("New tag: {} and content: {}".format(tag, content.encode(encoding='utf-8', errors='replace'))) | ||
| 114 | - # print(hashGcs) | ||
| 115 | - processed_files+=1 | ||
| 116 | - #with open(os.path.join(options.outputPath, f.replace(".xml", ".report.csv")), "w", encoding="utf-8") as oFile: | ||
| 117 | - with open(os.path.join(options.outputPath, f.replace(".xml", ".report.csv")), "w") as oFile: | ||
| 118 | - output = '"Serie","Sample",' | ||
| 119 | - for tag in tags: | ||
| 120 | - output = output + '"' + tag + '",' | ||
| 121 | - output = output.rstrip(',') | ||
| 122 | - oFile.write(output + "\n") | ||
| 123 | - complete_report.append(output) | ||
| 124 | - for serie, hashSample in hashGcs.items(): | ||
| 125 | - print("Serie: {}".format(serie)) | ||
| 126 | - for sample, hashTag in sorted(hashSample.items()): | ||
| 127 | - print("\tSample: {}".format(sample)) | ||
| 128 | - pTags = [] | ||
| 129 | - for tag in tags: | ||
| 130 | - if tag in hashTag: | ||
| 131 | - pTags.append(', '.join(hashTag[tag])) | ||
| 132 | - else: | ||
| 133 | - pTags.append('') | ||
| 134 | - | ||
| 135 | - output = '"{}","{}",'.format(serie, sample) | ||
| 136 | - for tag in pTags: | ||
| 137 | - output = output + '"' + tag + '",' | ||
| 138 | - output = output.rstrip(',') | ||
| 139 | - oFile.write(output + "\n") | ||
| 140 | - complete_report.append(output) | ||
| 141 | - # oFile.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(sample, serie, Technique, Orgn, Strain, Substrain, Gversion, Gtype, Phase, Air, Med, Temp, Supp)) | ||
| 142 | - # for tag, listContent in sorted(hashTag.items()): | ||
| 143 | - # print("\t\tTag: {}".format(tag)) | ||
| 144 | - # for content in sorted(listContent): | ||
| 145 | - # print("\t\t\tContent: {}".format(content.encode(encoding='utf-8', errors='replace'))) | ||
| 146 | - # # oFile.write("Serie: {}\tSample: {}\tTag: {}\tContent: {}".format(serie, sample, tag, content.encode(encoding='utf-8', errors='replace'))) | ||
| 147 | - # oFile.write("Serie: {}\tSample: {}\tTag: {}\tContent: {}\n".format(serie, sample, tag, content)) | ||
| 148 | - saved_files+=1 | ||
| 149 | - | ||
| 150 | - with open(os.path.join(options.outputPath, "GSE_family.complete-report.csv"), "w") as oFile: | ||
| 151 | - for line in complete_report: | ||
| 152 | - oFile.write(line + "\n") | ||
| 153 | - | ||
| 154 | - print("Processed files: {}".format(processed_files)) | ||
| 155 | - print("Saved files: {}".format(saved_files)) | ||
| 156 | - | ||
| 157 | - |
data-sets/scripts/file_output.py
deleted
100644 → 0
| 1 | -# -*- coding: UTF-8 -*- | ||
| 2 | -import os | ||
| 3 | -import sys | ||
| 4 | -import argparse | ||
| 5 | -import re | ||
| 6 | -import numpy as np | ||
| 7 | -from datetime import * | ||
| 8 | -__author__ = 'KevinML' | ||
| 9 | - | ||
| 10 | -# Objective: Obtención del metadato y del contenido de todas las líneas con <Tags/> dentro de un archivo. | ||
| 11 | - | ||
| 12 | -# Parameters: | ||
| 13 | -# 1) --inputPath input path | ||
| 14 | -# 2) --outputPath output path | ||
| 15 | - | ||
| 16 | -# Output: | ||
| 17 | -# 1) | ||
| 18 | - | ||
| 19 | -# Execution: | ||
| 20 | -#Example 1 | ||
| 21 | -#python3 recorrer_archivos_o.py --inputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/tagged-xml-data/ | ||
| 22 | -#--outputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/output-kevin/ | ||
| 23 | - | ||
| 24 | -#Example 2 | ||
| 25 | -#python3 /home/kevinml/automatic-extraction-growth-conditions/scripts/recorrer_archivos_o.py | ||
| 26 | -#--inputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/tagged-xml-data/ | ||
| 27 | -#--outputPath /home/kevinml/automatic-extraction-growth-conditions/data-sets/output-kevin/ | ||
| 28 | - | ||
| 29 | -########################################################### | ||
| 30 | -# MAIN PROGRAM # | ||
| 31 | -########################################################### | ||
| 32 | - | ||
| 33 | -parser = argparse.ArgumentParser(description='Obtenecion de metadatos y del contenido de de lineas con <Tags/>', | ||
| 34 | - epilog= 'Bien Hecho!') | ||
| 35 | -parser.add_argument('--inputPath', dest='inputPath', metavar='PATH', required = True, | ||
| 36 | - help='Ingrese el archivo de entrada.') | ||
| 37 | -parser.add_argument('--outputPath', dest='outputPath', metavar='PATH', required = True, | ||
| 38 | - help='Ingrese el archivo de salida.') | ||
| 39 | - | ||
| 40 | -args = parser.parse_args() | ||
| 41 | - | ||
| 42 | -#if len(args) != 2: | ||
| 43 | -# parser.error("Se introdujeron mas o menos de 2 parametros.") | ||
| 44 | -# sys.exit(1) | ||
| 45 | - | ||
| 46 | -# Printing parameter values | ||
| 47 | -print('-------------------------------- PARAMETERS --------------------------------') | ||
| 48 | -print("Path to read input files: " + str(args.inputPath)) | ||
| 49 | -print("Path to place output files: " + str(args.outputPath)) | ||
| 50 | - | ||
| 51 | -#ModificCIO TEMPORAL | ||
| 52 | - | ||
| 53 | -archivo = {} | ||
| 54 | -regexTag = re.compile(r'<[A-Za-z]+>') | ||
| 55 | -exit_file = r"exit_file.xml" | ||
| 56 | - | ||
| 57 | -with open(os.path.join(args.outputPath, exit_file), mode = "w") as oFile: | ||
| 58 | - oFile.write('#Fecha:{}\t\t\n#Archivo\tMetadato\tContenido\n\n'.format(datetime.today())) | ||
| 59 | - | ||
| 60 | -for path, dirs, files in os.walk(args.inputPath): | ||
| 61 | - for f in files: | ||
| 62 | - metadatos = {} | ||
| 63 | - with open(os.path.join(args.inputPath, f), mode ='r', encoding ="utf-8") as iFile: | ||
| 64 | - for line in iFile: | ||
| 65 | - line = line.strip('\n') | ||
| 66 | - if regexTag.search(line): | ||
| 67 | - renglon = line.split(" = ") | ||
| 68 | - if renglon[0] in metadatos: | ||
| 69 | - metadatos[renglon[0]].append(renglon[1]) | ||
| 70 | - else: | ||
| 71 | - metadatos[renglon[0]] = [renglon[1]] | ||
| 72 | - | ||
| 73 | - archivo[f] = metadatos | ||
| 74 | - | ||
| 75 | - with open(os.path.join(args.outputPath, exit_file), mode = "a") as oFile: | ||
| 76 | - #oFile.write('Archivo\t' + 'Metadato\t' + 'Contenido') | ||
| 77 | - for arch in sorted(archivo): | ||
| 78 | - for k,v in sorted(metadatos.items()): | ||
| 79 | - for x in v: | ||
| 80 | - oFile.write('{}\t{}\t{}\n'.format(arch, k, x)) |
data-sets/scripts/gzip-2-soft.py
deleted
100644 → 0
| 1 | -# -*- coding: UTF-8 -*- | ||
| 2 | - | ||
| 3 | -from optparse import OptionParser | ||
| 4 | -import os | ||
| 5 | -import sys | ||
| 6 | -import gzip | ||
| 7 | -import shutil | ||
| 8 | - | ||
| 9 | -__author__ = 'CMendezC' | ||
| 10 | - | ||
| 11 | -# Objective: uncompress gzip soft file to text soft file | ||
| 12 | - | ||
| 13 | -# Parameters: | ||
| 14 | -# 1) --inputPath input path | ||
| 15 | -# 2) --outputPath output path | ||
| 16 | - | ||
| 17 | -# Output: | ||
| 18 | -# 1) Text soft file | ||
| 19 | - | ||
| 20 | -# Execution: | ||
| 21 | -# python gzip-2-soft.py | ||
| 22 | -# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\gzip-data | ||
| 23 | -# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data | ||
| 24 | -# python gzip-2-soft.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\gzip-data --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data | ||
| 25 | - | ||
| 26 | -########################################################### | ||
| 27 | -# MAIN PROGRAM # | ||
| 28 | -########################################################### | ||
| 29 | - | ||
| 30 | -if __name__ == "__main__": | ||
| 31 | - # Parameter definition | ||
| 32 | - parser = OptionParser() | ||
| 33 | - parser.add_option("--inputPath", dest="inputPath", | ||
| 34 | - help="Path to read input files", metavar="PATH") | ||
| 35 | - parser.add_option("--outputPath", dest="outputPath", | ||
| 36 | - help="Path to place output files", metavar="PATH") | ||
| 37 | - | ||
| 38 | - (options, args) = parser.parse_args() | ||
| 39 | - if len(args) > 0: | ||
| 40 | - parser.error("None parameter entered.") | ||
| 41 | - sys.exit(1) | ||
| 42 | - | ||
| 43 | - # Printing parameter values | ||
| 44 | - print('-------------------------------- PARAMETERS --------------------------------') | ||
| 45 | - print("Path to read input files: " + str(options.inputPath)) | ||
| 46 | - print("Path to place output files: " + str(options.outputPath)) | ||
| 47 | - | ||
| 48 | - # Walk directory to read files | ||
| 49 | - for path, dirs, files in os.walk(options.inputPath): | ||
| 50 | - for f in files: | ||
| 51 | - if f.endswith(".gz"): | ||
| 52 | - print("Processing...{}/{}".format(options.inputPath, f)) | ||
| 53 | - try: | ||
| 54 | - with gzip.open(os.path.join(options.inputPath, f), 'rb') as f_in: | ||
| 55 | - with open(os.path.join(options.outputPath, f.replace('.soft.gz', '.txt')), 'wb') as f_out: | ||
| 56 | - shutil.copyfileobj(f_in, f_out) | ||
| 57 | - except: | ||
| 58 | - pass |
data-sets/scripts/output.txt
deleted
100644 → 0
This diff is collapsed. Click to expand it.
data-sets/scripts/soft-2-xml.py
deleted
100644 → 0
| 1 | -# -*- coding: UTF-8 -*- | ||
| 2 | - | ||
| 3 | -from optparse import OptionParser | ||
| 4 | -import os | ||
| 5 | -import sys | ||
| 6 | - | ||
| 7 | -__author__ = 'CMendezC' | ||
| 8 | - | ||
| 9 | -# Objective: convert soft file to XML file: | ||
| 10 | -# include headings, tags, substitute & and < | ||
| 11 | - | ||
| 12 | -# Parameters: | ||
| 13 | -# 1) --inputPath input path | ||
| 14 | -# 2) --outputPath output path | ||
| 15 | - | ||
| 16 | -# Output: | ||
| 17 | -# 1) XML File with soft file content | ||
| 18 | - | ||
| 19 | -# Execution: | ||
| 20 | -# python soft-2-xml.py | ||
| 21 | -# --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data-additional | ||
| 22 | -# --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\xml-data | ||
| 23 | -# python soft-2-xml.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\xml-data | ||
| 24 | -# Additional files | ||
| 25 | -# python soft-2-xml.py --inputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\soft-data-additional --outputPath C:\Users\cmendezc\Documents\GENOMICAS\gitlab_automatic-extraction-growth-conditions\data-sets\xml-data | ||
| 26 | - | ||
| 27 | -########################################################### | ||
| 28 | -# MAIN PROGRAM # | ||
| 29 | -########################################################### | ||
| 30 | - | ||
| 31 | -if __name__ == "__main__": | ||
| 32 | - # Parameter definition | ||
| 33 | - parser = OptionParser() | ||
| 34 | - parser.add_option("--inputPath", dest="inputPath", | ||
| 35 | - help="Path to read input files", metavar="PATH") | ||
| 36 | - parser.add_option("--outputPath", dest="outputPath", | ||
| 37 | - help="Path to place output files", metavar="PATH") | ||
| 38 | - | ||
| 39 | - (options, args) = parser.parse_args() | ||
| 40 | - if len(args) > 0: | ||
| 41 | - parser.error("None parameter entered.") | ||
| 42 | - sys.exit(1) | ||
| 43 | - | ||
| 44 | - # Printing parameter values | ||
| 45 | - print('-------------------------------- PARAMETERS --------------------------------') | ||
| 46 | - print("Path to read input files: " + str(options.inputPath)) | ||
| 47 | - print("Path to place output files: " + str(options.outputPath)) | ||
| 48 | - | ||
| 49 | - # Walk directory to read files | ||
| 50 | - processedFiles = 0 | ||
| 51 | - for path, dirs, files in os.walk(options.inputPath): | ||
| 52 | - for f in files: | ||
| 53 | - if f.endswith("_family.soft"): | ||
| 54 | - print("Processing...{}/{}".format(options.inputPath, f)) | ||
| 55 | - softText = '' | ||
| 56 | - with open(os.path.join(options.inputPath, f), "r", encoding="utf-8", errors="replace") as iFile: | ||
| 57 | - with open(os.path.join(options.outputPath, f.replace(".soft", ".xml")), "w", | ||
| 58 | - encoding="utf-8") as oFile: | ||
| 59 | - oFile.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n\n<gse xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\nxsi:noNamespaceSchemaLocation=\"esquema-gcs.xsd\">\n\n") | ||
| 60 | - for line in iFile: | ||
| 61 | - line = line.replace("&", "&amp;") | ||
| 62 | - line = line.replace("<", "&lt;") | ||
| 63 | - # line = line.replace(">", "&gt;") | ||
| 64 | - # line = line.replace("\"", "&quot;") | ||
| 65 | - # line = line.replace("\'", "&apos;") | ||
| 66 | - oFile.write(line) | ||
| 67 | - oFile.write("\n</gse>\n") | ||
| 68 | - processedFiles+=1 | ||
| 69 | - print("Processed files: {}".format(processedFiles)) |
File mode changed
data-sets/soft-data/borrame.txt.txt
deleted
100644 → 0
File mode changed
data-sets/xml-data/borrame.txt
deleted
100644 → 0
File mode changed
-
Please register or login to post a comment