Script para generar archivos para CoreNLP (regex NER).

Kevin Meza Landeros
Commit ad2a3c1997c42a9955a8f3085f8a346e4b2cfa7f ad2a3c19 1 parent 3053afd6
Showing 1 changed file with 113 additions and 0 deletions
CoreNLP/bin/filtering.py
--- a/CoreNLP/bin/filtering.py 0 → 100644
View file @ad2a3c1
+++ b/CoreNLP/bin/filtering.py 0 → 100644
View file @ad2a3c1
+ # Importacion de librerias
+ import pandas as pd
+ import re
+ import argparse
+ import os
+ 
+ __author__ = 'kevinml'
+ 
+ # Objective 
+ # Take a two-column, tab-separated file and write 3 different files:
+ # 1.- <FileName>_1Word_NoGreek.txt - Rows whose first column is a single word containing ONLY alphanumeric characters.
+ # 2.- <FileName>_Words_NoGreek.txt - Rows whose first column has one or more words, each containing ONLY alphanumeric characters.
+ # 3.- <FileName>_1Word_Greek.txt - Rows whose first column is a single word; non-alphanumeric characters are allowed.
+ # 
+ # Input parameters 
+ # --inputPath=PATH     Path of inputfiles.
+ # --outputPath=PATH    Path of outputfiles. 
+ # --iFile=FILE         Name of the input file (inside inputPath) from which the 3 files are generated.
+ # 
+ # Output 
+ # 
+ # 
+ # Examples 
+ # python filtering.py --inputPath /home/kevinml/Dropbox/LCG/Labs/PGC/automatic-extraction-GEO --outputPath /home/kevinml/Dropbox/LCG/Labs/PGC/automatic-extraction-GEO --iFile NER_words.txt
+ 
+ #################################################################################### 
+ #                                    FUNCTIONS                                     # 
+ ####################################################################################
+ 
# Matches any character outside the ASCII "word" set. An explicit class is
# used instead of ``\W`` because in Python 3 ``\W`` is Unicode-aware and
# treats Greek letters as word characters, so they would not be rejected.
_NON_ASCII_WORD_CHAR = re.compile(r"[^A-Za-z0-9_]")


def alphanum_and_NOGreek(word):
    """Return True when *word* contains only ASCII alphanumeric characters.

    The underscore is accepted as well (same character set as an ASCII
    ``\\w``). Any other character, including Greek letters, punctuation and
    whitespace, makes the function return False.

    Args:
        word: String to check.

    Returns:
        True if no character outside ``[A-Za-z0-9_]`` is present (an empty
        string therefore yields True), False otherwise.
    """
    return _NON_ASCII_WORD_CHAR.search(word) is None
+ 
+ #################################################################################### 
+ #                                   MAIN PROGRAM                                  # 
+ ####################################################################################
+ 
def _is_single_word(term):
    """Return True when *term* contains no space, i.e. it is a single word."""
    return len(term.split(" ")) == 1


def _all_words_alphanumeric(term):
    """Return True when every space-separated word of *term* passes alphanum_and_NOGreek."""
    return all(alphanum_and_NOGreek(word) for word in term.split(" "))


def _write_filtered(out_path, pairs, keep):
    """Write the (term, label) pairs whose term satisfies *keep*, one tab-separated pair per line."""
    # Explicit UTF-8 so terms with Greek letters are written correctly
    # regardless of the locale's default encoding.
    with open(out_path, "w", encoding="utf-8") as out_file:
        out_file.writelines(
            term + "\t" + label + "\n" for term, label in pairs if keep(term)
        )


if __name__ == '__main__':

    # Command-line parameters.
    parser = argparse.ArgumentParser()
    parser.add_argument('--inputPath', help="Ruta donde se encuentra el archivo a procesar. Ej: --inputPath /home/kevinml/transporter-substrate-interactions/", required=True)
    parser.add_argument('--outputPath', help="Ruta donde se depositaran los archivos resultantes. Ej: --outputPath /home/kevinml/transporter-substrate-interactions/", required=True)
    parser.add_argument('--iFile', help="Archivo a procesar. Ej: --iFile NER_words.txt", required=True)
    args = parser.parse_args()

    # Echo the parameters that were received.
    print('\n-------------------------------- PARAMETERS --------------------------------\n')
    print('Input Path: ' + str(args.inputPath))
    print('File: ' + str(args.iFile))
    print('Output Path: ' + str(args.outputPath))
    print('\n-------------------------------- PROCESSING --------------------------------\n')

    # Read the input as plain strings: without dtype=str / keep_default_na=False
    # pandas would turn terms such as "NA" or "null" into NaN and purely numeric
    # columns into numbers, which breaks the string handling below.
    # NOTE(review): the first line of the input is still treated as a header
    # (pandas default) and is therefore never written to any output - confirm
    # that the input files really start with a header line.
    table = pd.read_csv(
        os.path.join(args.inputPath, args.iFile),
        sep="\t",
        dtype=str,
        keep_default_na=False,
    ).fillna("")

    # Positional access to the first two columns, whatever their header names are.
    pairs = list(zip(table.iloc[:, 0], table.iloc[:, 1]))

    # Output names are derived from the input name without its extension.
    stem = os.path.splitext(args.iFile)[0]

    print("######################\n#   PRIMER ARCHIVO   #\n######################")
    # First file: single-word terms made up only of alphanumeric characters.
    first_name = stem + "_1Word_NoGreek.txt"
    _write_filtered(
        os.path.join(args.outputPath, first_name),
        pairs,
        lambda term: _is_single_word(term) and alphanum_and_NOGreek(term),
    )
    print("\nArchivo de contenidos de una sola palabra ha sido generado. NOTA: Se han excluido letras griegas.\nNombre del archivo:" + first_name + "\n")

    print("#######################\n#   SEGUNDO ARCHIVO   #\n#######################")
    # Second file: terms in which every word is alphanumeric.
    # NOTE(review): single-word terms also satisfy this condition, so they are
    # written here as well as in the first file - confirm this is intended.
    second_name = stem + "_Words_NoGreek.txt"
    _write_filtered(
        os.path.join(args.outputPath, second_name),
        pairs,
        _all_words_alphanumeric,
    )
    print("\nArchivo de contenidos de varias palabras ha sido generado. NOTA: Se han excluido letras griegas.\nNombre del archivo:" + second_name + "\n")

    print("######################\n#   TERCER ARCHIVO   #\n######################")
    # Third file: every single-word term, with no restriction on its characters.
    # NOTE(review): the file header describes this output as terms that contain
    # non-alphanumeric characters, but no such check is applied - confirm
    # whether alphanumeric-only terms should be excluded.
    third_name = stem + "_1Word_Greek.txt"
    _write_filtered(
        os.path.join(args.outputPath, third_name),
        pairs,
        _is_single_word,
    )
    print("\nArchivo de contenidos de una palabra ha sido generado.\nNombre del archivo:" + third_name + "\n")
\ No newline at end of file