Showing
1 changed file
with
32 additions
and
0 deletions
scripts/split_partition.py
0 → 100644
#!/usr/bin/python
"""SPLIT PARTITION: split a cluster partition file into one file per cluster.

The input file lists sentence numbers grouped under cluster headers of the
form ``#Cluster: <n>``. Each sentence reference is a 1-based line number in
the sentences corpus, followed by ``_s``. For every cluster header a file
``cluster<n>.txt`` is written to the output directory, containing the
referenced corpus lines in the order they are listed.
"""
from optparse import OptionParser
import os
import re

# Corpus the sentence numbers refer to; used unless another path is given.
DEFAULT_SENTENCES_PATH = '/home/mrocha/storage/embeddings/gene-disease-embeddings/corpora/articles-titles.txt'

# Capture the whole cluster number (one or more digits) so that clusters
# 10, 11, ... do not collapse onto the file of cluster 1.
_CLUSTER_RE = re.compile(r'#Cluster:\s+(\d+)')
_SENTENCE_RE = re.compile(r'(\d+)_s')


def split_partition(infile, outdir, sentences_path=DEFAULT_SENTENCES_PATH):
    """Write one ``cluster<n>.txt`` file per cluster found in *infile*.

    Args:
        infile: Path of the partition file (cluster headers followed by
            sentence references).
        outdir: Directory that receives the per-cluster files; it is created
            if it does not exist.
        sentences_path: Path of the corpus whose lines the sentence numbers
            index (1-based).

    Returns:
        The list of output file paths, in the order the cluster headers were
        encountered.

    Raises:
        ValueError: A sentence reference appears before any cluster header.
        IndexError: A sentence number is not within 1..number of corpus lines.
    """
    with open(sentences_path, 'r') as sentencefile:
        sentences = sentencefile.readlines()

    if outdir and not os.path.isdir(outdir):
        os.makedirs(outdir)

    written = []
    out = None
    try:
        with open(infile, 'r') as clusterfile:
            for lineno, line in enumerate(clusterfile, 1):
                header = _CLUSTER_RE.search(line)
                if header:
                    # Close the previous cluster's file before starting a new
                    # one so handles do not leak.
                    if out is not None:
                        out.close()
                    name = os.path.join(
                        outdir, "cluster" + header.group(1) + ".txt")
                    out = open(name, 'w')
                    written.append(name)
                    continue

                # NOTE(review): only the first "<n>_s" reference on a line is
                # used, as in the original script -- confirm that the input
                # never lists several references on one line.
                ref = _SENTENCE_RE.search(line)
                if not ref:
                    continue
                if out is None:
                    raise ValueError(
                        "%s:%d: sentence reference found before any "
                        "'#Cluster:' header" % (infile, lineno))
                index = int(ref.group(1))
                # Reject 0 explicitly: sentences[0 - 1] would silently pick
                # the last line of the corpus.
                if not 1 <= index <= len(sentences):
                    raise IndexError(
                        "%s:%d: sentence number %d is outside 1..%d"
                        % (infile, lineno, index, len(sentences)))
                out.write(sentences[index - 1])
    finally:
        if out is not None:
            out.close()
    return written


def main(argv=None):
    """Parse the command line and split the given partition file."""
    # Receive input and output locations.
    parser = OptionParser()
    parser.add_option("-i", dest="inF",help="Input text file. Clusters of sentences starting with cluster name indicated by '#'. Each sentence must be followed by _s", metavar="PATH")
    parser.add_option("-o", dest="otF",help="output directory", metavar="PATH")
    parser.add_option("-s", dest="stF", default=DEFAULT_SENTENCES_PATH,
                      help="sentences file indexed by the sentence numbers "
                           "(default: %default)", metavar="PATH")

    (options, args) = parser.parse_args(argv)
    # parser.error() prints the message and exits with status 2 by itself.
    if args:
        parser.error("Unexpected positional arguments: " + " ".join(args))
    if not options.inF:
        parser.error("Please indicate an input file")
    if not options.otF:
        parser.error("Please indicate an output directory")

    # Separate the sentences belonging to each cluster and write them to one
    # file per cluster.
    split_partition(options.inF, options.otF, options.stF)


if __name__ == "__main__":
    main()
... | \ No newline at end of file | ... | \ No newline at end of file |
-
Please register or login to post a comment