Script para generar archivos para CoreNLP (regex NER).

Kevin Meza Landeros
Commit ad2a3c1997c42a9955a8f3085f8a346e4b2cfa7f ad2a3c19 1 parent 3053afd6
Showing 1 changed file with 113 additions and 0 deletions
CoreNLP/bin/filtering.py
--- a/CoreNLP/bin/filtering.py 0 → 100644
View file @ad2a3c1
+++ b/CoreNLP/bin/filtering.py 0 → 100644
View file @ad2a3c1
+# Importacion de librerias
+import pandas as pd
+import re
+import argparse
+import os
+
+__author__ = 'kevinml'
+
+# Objective 
+# Take two column files and make 3 different files:
+# 1.- <FileName>_1Word_NoGreek.txt - Archivo donde la primer columna es unipalabra y SOLO contiene numeros alfanumericos.
+# 2.- <FileName>_Words_NoGreek.tx - Archivo donde la primer columna es multipalabra y SOLO contiene numeros alfanumericos.
+# 3.- <FileName>_1Word_Greek.txt - Archivo donde la primer columna es unipalabra y contiene caracteres NO alfanumericos.
+# 
+# Input parameters 
+# --inputPath=PATH     Path of inputfiles.
+# --outputPath=PATH    Path of outputfiles. 
+# --iFile              Archivo a partir del cual se obtendran los 3 archivos. 
+# 
+# Output 
+# 
+# 
+# Examples 
+# python filtering.py --inputPath /home/kevinml/Dropbox/LCG/Labs/PGC/automatic-extraction-GEO --outputPath /home/kevinml/Dropbox/LCG/Labs/PGC/automatic-extraction-GEO --iFile NER_words.txt
+
+#################################################################################### 
+#                                    FUNCTIONS                                     # 
+####################################################################################
+
+def alphanum_and_NOGreek(word):     
+    ''' Esta funcion regresa True si en la palabra que recibe como parametro NO SE ENCUENTRAN CARACTERES ALFANUMERICOS
+    y regresa False si en la palabra se encuentra algún caracter NO ALFANUMERICO
+    '''
+    if re.search("\W", word):  
+        return False    
+    else:
+        return True   
+
+#################################################################################### 
+#                                   MAIN PROGRAM                                  # 
+####################################################################################
+
+if __name__ == '__main__':
+
+    # Definicion de Parametros
+    parser = argparse.ArgumentParser()     
+    parser.add_argument('--inputPath', help="Ruta donde se encuentra el archivo a procesar. Ej: --inputPath /home/kevinml/transporter-substrate-interactions/", required=True)     
+    parser.add_argument('--outputPath', help="Ruta donde se depositaran los archivos resultantes. Ej: --outputPath /home/kevinml/transporter-substrate-interactions/", required=True)     
+    parser.add_argument('--iFile', help="Archivo a procesar. Ej: --iFile NER_words.txt", required=True)     
+    args = parser.parse_args()     
+
+    # Se imprimen los parametros ingresados    
+    print('\n-------------------------------- PARAMETERS --------------------------------\n')         
+    print('Input Path: ' + str(args.inputPath))
+    print('File: ' + str(args.iFile))                  
+    print('Output Path: ' + str(args.outputPath))         
+    print('\n-------------------------------- PROCESSING --------------------------------\n')     
+
+    # Se abre el archivo a procesar
+    file = pd.read_csv(os.path.join(args.inputPath, args.iFile), sep = "\t")
+
+    print("######################\n#   PRIMER ARCHIVO   #\n######################")
+    conditions = []
+    lines = []
+    # Se abre el primer archivo
+    with open (os.path.join(args.outputPath,str(args.iFile[:-4]) + "_1Word_NoGreek.txt"), "w+") as oFile:
+        for index, row in file.iterrows():
+            if len(row[0].split(" ")) == 1 and alphanum_and_NOGreek(str(row[0].split(" ")[0])) == True: # Se verifica que en la primer columna solo haya un palabra y que esta solo tenga caracteres alfanumericos.
+                conditions.append(row[0])
+                lines.append(row[1])
+        # Se escriben en el primer archivo aquellos valores que cumplen las condiciones.
+        for i in range(len(lines)):
+            oFile.write(conditions[i] + "\t" + lines[i] + '\n')
+
+    print("\nArchivo de contenidos de una sola palabra ha sido generado. NOTA: Se han excluido letras griegas.\nNombre del archivo:" + str(args.iFile[:-4]) + "_1Word_NoGreek.txt\n")
+
+    print("#######################\n#   SEGUNDO ARCHIVO   #\n#######################")
+    conditions_2 = []
+    lines_2 = []
+    # Se abre el segundo archivo
+    with open (os.path.join(args.outputPath,str(args.iFile[:-4]) + "_Words_NoGreek.txt"), "w+") as oFile:
+        for index, row in file.iterrows():
+            # La bandera en 1 indica que ninguna palabra de la primer columna tiene caracteres NO alfanumericos
+            # La bandera en 0 indica que al menos una palabra tienes caracteres NO alfanumericos. 
+            bandera = 1
+            # Con el for se va a verificando la presencia de caracteres alfanumericos en cada palabra de la primera columna
+            for i in range(0, len(row[0].split(" "))):
+                if alphanum_and_NOGreek(str(row[0].split(" ")[i])) == False:
+                    bandera = 0
+            if bandera == 1:
+                conditions_2.append(row[0])
+                lines_2.append(row[1]) 
+        # Se escriben en el primer archivo aquellos valores que cumplen las condiciones.
+        for i in range(len(lines_2)):
+            oFile.write(conditions_2[i] + "\t" + lines_2[i] + '\n')
+
+    print("\nArchivo de contenidos de varias palabras ha sido generado. NOTA: Se han excluido letras griegas.\nNombre del archivo:" + str(args.iFile[:-4]) + "SeveralWords_NoGreek_Filter\n")
+
+    print("######################\n#   TERCER ARCHIVO   #\n######################")
+    conditions_3 = []
+    lines_3 = []
+    # Se abre el tercer archivo
+    with open (os.path.join(args.outputPath,str(args.iFile[:-4]) + "_1Word_Greek.txt"), "w+") as oFile:
+        for index, row in file.iterrows():
+            # Se verifica que la primer columna sea unipalabra.
+            if len(row[0].split(" ")) == 1:
+                conditions_3.append(row[0])
+                lines_3.append(row[1])           
+        # Se escriben en el primer archivo aquellos valores que cumplen las condiciones.
+        for i in range(len(lines_3)):
+            oFile.write(conditions_3[i] + "\t" + lines_3[i] + '\n')
+
+    print("\nArchivo de contenidos de una palabra ha sido generado:.nNombre del archivo:" + str(args.iFile[:-4]) + "SeveralWords_Greek_Filter\n")
\ No newline at end of file