regex.py 3.29 KB
import fileinput
import re
import sys

if ( len( sys.argv ) < 3 ):
    sys.stderr.write( "E: usage: " +sys.argv[0] + " <input_file> <output_file> \n" )
    sys.stderr.flush();

    exit( 2 );
else:
    print("Ok.")

#LEER ARCHIVO INPUT
text_file = open( sys.argv[1], "r" )
dato = text_file.read().splitlines()
text_file.close()


#QUITA EXTENSION DE NOMBRE DE ARCHIVO
split_line = sys.argv[2]
split_line = split_line[:-4]
file_name=""
file_name = split_line + ".san"
open( file_name , 'w').close()

#ESCRIBIR REGEX EN ARGV 2
for line in dato:
    line = re.sub('[\(][^\(|^\)]*\s[0-9]+[a-z]{1}\s[^\(|^\)]*[\)]', '', line.rstrip()) #elimina (_NNNNa_)
    line = re.sub('[\[][^\(|^\)]*\s[0-9]+[a-z]{1}\s[^\(|^\)]*[\]]', '', line.rstrip()) #elimina [_NNNNa_]
    line = re.sub('[\(][^\(|^\)]*\s([0-9]+,?)+\s[^\(|^\)]*[\)]', '', line.rstrip()) #elimina (_NN,NN,NN_)
    line = re.sub('[\[][^\(|^\)]*\s([0-9]+,?)+\s[^\(|^\)]*[\]]', '', line.rstrip()) #elimina [_NN,NN,NN_]
    line = re.sub('[\(][^\(|^\)]*\s[0-9]+\s[^\(|^\)]*[\)]', '', line.rstrip()) #elimina (_num_)
    line = re.sub('[\(][^\(|^\)]*\s[0-9]+\.[0-9]+\s[^\(|^\)]*[\)]', '', line.rstrip()) #elimina (_num.num_)
    line = re.sub('[\(][^\(|^\)]*\s[0-9]+\-[0-9]+\s[^\(|^\)]*[\)]', '', line.rstrip()) #elimina (_num-num_)
    line = re.sub('[\[][^\(|^\)]*\s[0-9]+\s[^\(|^\)]*[\]]', '', line.rstrip()) #elimina [_num_]
    line = re.sub('[\[][^\(|^\)]*\s[0-9]+\.[0-9]+\s[^\(|^\)]*[\]]', '', line.rstrip()) #elimina [_num.num_]
    line = re.sub('[\[][^\(|^\)]*\s[0-9]+\-[0-9]+\s[^\(|^\)]*[\]]', '', line.rstrip()) #elimina [_num-num_]
    line = re.sub('[\(]\s[a-zA-Z]{1}\s[\)]', '', line.rstrip()) #elimina (_alpha_)
    line = re.sub('[\[]\s[a-zA-Z]{1}\s[\]]', '', line.rstrip()) #elimina [_alpha_]
    line = re.sub('[\(]\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s[\)]', '', line.rstrip()) #elimina (_Roman_)
    line = re.sub('[\(]\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s\-\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s[\)]', '', line.rstrip()) #elimina (_Roman-Roman_)
    line = re.sub('[\(]\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s[\)]', '', line.rstrip()) #elimina (_roman_)
    line = re.sub('[\(]\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s\-\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s[\)]', '', line.rstrip()) #elimina (_roman-roman_)
    line = re.sub('[\[]\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s[\]]', '', line.rstrip()) #elimina [_Roman_]
    line = re.sub('[\[]\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s\-\sM{0,4}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})\s[\]]', '', line.rstrip()) #elimina [_Roman-Roman_]
    line = re.sub('[\[]\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s[\]]', '', line.rstrip()) #elimina [_roman_]
    line = re.sub('[\[]\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s\-\sm{0,4}(cm|cd|d?c{0,3})(xc|xl|l?x{0,3})(ix|iv|v?i{0,3})\s[\]]', '', line.rstrip()) #elimina [_roman-roman_]
    line = re.sub('[\(][^\(|^\)]*\s(fig\s\.|figure|see|i\s\.\se\s\.|e\s\.\sg\s\.|tab\s\.table)\s[^\(|^\)]*[\)]', '', line.rstrip(), flags=re.I) #
    line = re.sub('  ', ' ', line.rstrip()) #elimina (_NNNNa_)
    #print(line)


    save_file = open( file_name, "a" )
    save_file.write(line)
    save_file.write("\n")
    save_file.close()