split_partition.py 1.35 KB
#!/usr/bin/python
# SPLIT PARTITION: Reads a file that lists sentence numbers grouped by the cluster they belong to, and writes one output file per cluster.
import os
import re
from optparse import OptionParser

# Corpus used when -s is not given: one sentence per line. The cluster file
# refers to sentences by their 1-based line number in this file.
DEFAULT_SENTENCE_FILE = '/home/mrocha/storage/embeddings/gene-disease-embeddings/corpora/articles-titles.txt'

# "#Cluster: <id>" starts a new cluster; the id may have several digits.
CLUSTER_HEADER = re.compile(r'#Cluster:\s+(\d+)')
# "<number>_s" references a sentence by its 1-based line number.
SENTENCE_REF = re.compile(r'(\d+)_s')


def build_parser():
    """Return the command-line parser for the script."""
    parser = OptionParser()
    parser.add_option("-i", dest="inF",help="Input text file. Clusters of sentences starting with cluster name indicated by '#'. Each sentence must be followed by _s", metavar="PATH")
    parser.add_option("-o", dest="otF",help="output directory", metavar="PATH")
    parser.add_option("-s", dest="snF", default=DEFAULT_SENTENCE_FILE,
                      help="Sentence file, one sentence per line [default: %default]",
                      metavar="PATH")
    return parser


def split_partition(cluster_path, sentence_path, outdir):
    """Write the sentences of every cluster to ``<outdir>/cluster<id>.txt``.

    cluster_path  -- file with "#Cluster: <id>" header lines, each followed
                     by lines containing a "<number>_s" sentence reference.
    sentence_path -- file with one sentence per line; references are
                     1-based line numbers into this file.
    outdir        -- existing directory that receives the cluster files.

    Only the first "<number>_s" reference on a line is used. Lines that
    match neither pattern are ignored. A cluster id that appears twice
    overwrites the file written for its first occurrence.

    Raises ValueError if a sentence reference appears before any cluster
    header, and IndexError if a reference is outside the sentence file.
    """
    with open(sentence_path, 'r') as sentencefile:
        sentences = sentencefile.readlines()

    out = None
    try:
        with open(cluster_path, 'r') as clusterfile:
            for line in clusterfile:
                header = CLUSTER_HEADER.search(line)
                if header:
                    # Close the previous cluster file before starting a new one.
                    if out is not None:
                        out.close()
                    name = os.path.join(outdir, "cluster" + header.group(1) + ".txt")
                    out = open(name, 'w')
                    continue

                ref = SENTENCE_REF.search(line)
                if not ref:
                    continue
                if out is None:
                    raise ValueError(
                        "Sentence reference found before any '#Cluster:' header: "
                        + line.strip())
                number = int(ref.group(1))
                # Reject 0 explicitly: sentences[-1] would silently pick the last line.
                if not 1 <= number <= len(sentences):
                    raise IndexError(
                        "Sentence number %d is out of range (1-%d)"
                        % (number, len(sentences)))
                out.write(sentences[number - 1])
    finally:
        if out is not None:
            out.close()


def main(argv=None):
    """Parse the command line (``sys.argv[1:]`` when argv is None) and run."""
    parser = build_parser()
    (options, args) = parser.parse_args(argv)
    # parser.error() prints the usage message and exits with status 2.
    if options.inF is None:
        parser.error("Please indicate an input file")
    if options.otF is None:
        parser.error("Please indicate an output directory")
    if args:
        parser.error("Unexpected positional arguments: " + " ".join(args))

    split_partition(options.inF, options.snF, options.otF)


if __name__ == "__main__":
    main()