split_partition.py
1.35 KB
#!/usr/bin/python
# SPLIT PARTITION: Receives a file with lists of sentence numbers, indicating which cluster they belong to. Generates one file for each of the clusters.
from optparse import OptionParser
import re
# Default sentence corpus; line N of this file is treated as sentence number N.
DEFAULT_SENTENCE_FILE = '/home/mrocha/storage/embeddings/gene-disease-embeddings/corpora/articles-titles.txt'

# Cluster header line, e.g. "#Cluster: 12". The whole number is captured so
# that clusters 10, 11, ... do not collapse onto a single-digit file name.
CLUSTER_HEADER_RE = re.compile(r'#Cluster:\s+(\d+)')
# Sentence reference, e.g. "345_s".
SENTENCE_ID_RE = re.compile(r'(\d+)_s')


def split_partition(infile, outdir, sentence_path=DEFAULT_SENTENCE_FILE):
    """Write the sentences of every cluster listed in *infile* to its own file.

    *infile* is read line by line. A line containing ``#Cluster: <number>``
    starts a new cluster and (re)creates ``<outdir>/cluster<number>.txt``.
    Any other line containing ``<n>_s`` appends line ``n`` (1-based) of
    *sentence_path* to the file of the current cluster; only the first
    ``<n>_s`` found on a line is used. Lines matching neither pattern are
    ignored.

    Args:
        infile: path of the cluster listing.
        outdir: existing directory that receives the ``cluster<number>.txt``
            files.
        sentence_path: path of the file holding one sentence per line.

    Raises:
        ValueError: a sentence reference appears before any cluster header.
        IndexError: a sentence number is smaller than 1 or larger than the
            number of lines in *sentence_path*.
    """
    out = None
    try:
        with open(infile, 'r') as clusterfile, open(sentence_path, 'r') as sentencefile:
            sentences = sentencefile.readlines()
            for line in clusterfile:
                header = CLUSTER_HEADER_RE.search(line)
                if header:
                    # Close the previous cluster file before switching, so
                    # handles do not pile up until the end of the run.
                    if out is not None:
                        out.close()
                    out = open(outdir + "/cluster" + header.group(1) + ".txt", 'w')
                    continue

                reference = SENTENCE_ID_RE.search(line)
                if reference is None:
                    continue
                if out is None:
                    raise ValueError(
                        "Sentence reference found before any '#Cluster:' header: %r" % line)

                number = int(reference.group(1))
                # Reject 0 explicitly: sentences[-1] would silently write the
                # last sentence instead of failing.
                if not 1 <= number <= len(sentences):
                    raise IndexError(
                        "Sentence number %d is out of range (1-%d)" % (number, len(sentences)))
                out.write(sentences[number - 1])
    finally:
        if out is not None:
            out.close()


def main(argv=None):
    """Parse the command line and split the cluster file.

    Args:
        argv: argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    parser = OptionParser()
    parser.add_option("-i", dest="inF",help="Input text file. Clusters of sentences starting with cluster name indicated by '#'. Each sentence must be followed by _s", metavar="PATH")
    parser.add_option("-o", dest="otF",help="output directory", metavar="PATH")
    parser.add_option("-s", dest="sentF", default=DEFAULT_SENTENCE_FILE,
                      help="Sentence file, one sentence per line [default: %default]", metavar="PATH")
    (options, args) = parser.parse_args(argv)

    # parser.error() prints the usage message and exits with status 2.
    if args:
        parser.error("Unexpected positional arguments: " + " ".join(args))
    if options.inF is None:
        parser.error("Please indicate an input file")
    if options.otF is None:
        parser.error("Please indicate an output directory")

    split_partition(options.inF, options.otF, options.sentF)


if __name__ == "__main__":
    main()