larisa

Upload new file

#!/usr/bin/python
# SPLIT PARTITION: Receives a file with lists of sentence numbers, indicating
# which cluster each one belongs to. Generates one file per cluster containing
# the text of the sentences assigned to it.
from optparse import OptionParser
import re

# Corpus used to resolve sentence numbers into text when no -s option is given
# (this is the path the script originally had hard-coded).
DEFAULT_SENTENCE_FILE = '/home/mrocha/storage/embeddings/gene-disease-embeddings/corpora/articles-titles.txt'

# Cluster header line, e.g. "#Cluster: 12". The whole number is captured so
# clusters 10, 11, ... do not collapse onto (and overwrite) cluster 1.
CLUSTER_RE = re.compile(r'#Cluster:\s+(\d+)')
# Sentence reference, e.g. "345_s": a 1-based line number in the sentence file.
SENTENCE_RE = re.compile(r'(\d+)_s')


def split_partition(cluster_lines, sentences, outdir):
    """Write the sentences of each cluster to <outdir>/cluster<N>.txt.

    cluster_lines -- iterable of lines from the cluster file. A line matching
                     "#Cluster: <N>" starts cluster N; any following line
                     containing "<K>_s" assigns sentence K to that cluster.
                     Only the first "<K>_s" on a line is used. Lines matching
                     neither pattern are ignored.
    sentences     -- list of sentence lines; reference K selects
                     sentences[K - 1].
    outdir        -- existing directory where the cluster files are created.

    Returns the list of output file paths, in the order they were opened.
    A cluster number that appears more than once re-opens its file in 'w'
    mode, so only the last occurrence's sentences are kept.

    Raises ValueError if a sentence reference appears before any cluster
    header, and IndexError if a sentence number is outside
    1..len(sentences).
    """
    out = None
    written = []
    try:
        for line in cluster_lines:
            header = CLUSTER_RE.search(line)
            if header:
                # Close the previous cluster's file before starting a new one
                # so handles do not pile up until interpreter exit.
                if out is not None:
                    out.close()
                name = outdir + "/cluster" + header.group(1) + ".txt"
                out = open(name, 'w')
                written.append(name)
                continue

            ref = SENTENCE_RE.search(line)
            if not ref:
                continue
            if out is None:
                raise ValueError(
                    "Sentence reference found before any '#Cluster:' header: %r" % line)
            number = int(ref.group(1))
            # Reject 0 explicitly: sentences[-1] would silently pick the
            # last sentence instead of failing.
            if not 1 <= number <= len(sentences):
                raise IndexError(
                    "Sentence number %d is outside 1..%d" % (number, len(sentences)))
            out.write(sentences[number - 1])
    finally:
        if out is not None:
            out.close()
    return written


def main(argv=None):
    """Parse the command line and split the cluster file into per-cluster files.

    argv -- argument list to parse; None means sys.argv[1:].

    Exits through parser.error (usage message, exit status 2) when the input
    file or output directory is missing or unexpected positional arguments
    are given.
    """
    # Receive input and output
    parser = OptionParser()
    parser.add_option("-i", dest="inF",help="Input text file. Clusters of sentences starting with cluster name indicated by '#'. Each sentence must be followed by _s", metavar="PATH")
    parser.add_option("-o", dest="otF",help="output directory", metavar="PATH")
    parser.add_option("-s", dest="sentF", default=DEFAULT_SENTENCE_FILE,
                      help="Sentence file, one sentence per line [default: %default]",
                      metavar="PATH")

    (options, args) = parser.parse_args(argv)
    # parser.error() prints the usage and exits by itself, so no explicit
    # sys.exit() is needed afterwards.
    if args or not options.inF:
        parser.error("Please indicate an input file")
    if not options.otF:
        parser.error("Please indicate an output directory")

    # Separate the sentences belonging to each cluster and print them to
    # separate files, one per cluster
    with open(options.inF, 'r') as clusterfile, open(options.sentF, 'r') as sentencefile:
        sentences = sentencefile.readlines()
        split_partition(clusterfile, sentences, options.otF)


if __name__ == "__main__":
    main()
...\ No newline at end of file ...\ No newline at end of file