Carlos-Francisco Méndez-Cruz

Feature extraction and vectorizer three sentences

...@@ -19,16 +19,17 @@ __author__ = 'CMendezC' ...@@ -19,16 +19,17 @@ __author__ = 'CMendezC'
19 # 3) --vectorizer Vectorizer: b=binary, f=frequency, t=tf-idf. 19 # 3) --vectorizer Vectorizer: b=binary, f=frequency, t=tf-idf.
20 20
21 # Output: 21 # Output:
22 -# 1) Files with vectors. 22 +# 1) Report with dictionary, vectors, cosine similarity matrix.
23 23
24 # Execution: 24 # Execution:
25 # python extraccion-caracteristicas-vectorizacion.py 25 # python extraccion-caracteristicas-vectorizacion.py
26 # --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/data-set-three-sentences 26 # --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/data-set-three-sentences
27 # --outputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/reports-three-sentences 27 # --outputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/reports-three-sentences
28 # --vectorizer b 28 # --vectorizer b
29 +# --feature word
29 30
30 # source activate python3 31 # source activate python3
31 -# python extraccion-caracteristicas-vectorizacion.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/data-set-three-sentences --outputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/reports-three-sentences --vectorizer b 32 +# python extraccion-caracteristicas-vectorizacion.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/data-set-three-sentences --outputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/reports-three-sentences --vectorizer b --feature word
32 33
33 ########################################################### 34 ###########################################################
34 # MAIN PROGRAM # 35 # MAIN PROGRAM #
...@@ -44,6 +45,9 @@ if __name__ == "__main__": ...@@ -44,6 +45,9 @@ if __name__ == "__main__":
44 parser.add_argument("--vectorizer", dest="vectorizer", required=True, 45 parser.add_argument("--vectorizer", dest="vectorizer", required=True,
45 help="Vectorizer: b=binary, f=frequency, t=tf-idf", metavar="CHAR", 46 help="Vectorizer: b=binary, f=frequency, t=tf-idf", metavar="CHAR",
46 choices=('b', 'f', 't'), default='b') 47 choices=('b', 'f', 't'), default='b')
48 + parser.add_argument("--feature", dest="feature", required=True,
49 + help="Feature: word, lemma, pos", metavar="TEXT",
50 + choices=('word', 'lemma', 'pos'), default='b')
47 51
48 args = parser.parse_args() 52 args = parser.parse_args()
49 53
...@@ -61,6 +65,7 @@ if __name__ == "__main__": ...@@ -61,6 +65,7 @@ if __name__ == "__main__":
61 # Read documents from input path 65 # Read documents from input path
62 for path, dirs, files in os.walk(args.inputPath): 66 for path, dirs, files in os.walk(args.inputPath):
63 for file in files: 67 for file in files:
68 + if file.endswith(args.feature):
64 with open(os.path.join(args.inputPath, file), mode="r", encoding="utf-8") as iFile: 69 with open(os.path.join(args.inputPath, file), mode="r", encoding="utf-8") as iFile:
65 print("...{}".format(file)) 70 print("...{}".format(file))
66 # Add file to document list 71 # Add file to document list
...@@ -85,7 +90,7 @@ if __name__ == "__main__": ...@@ -85,7 +90,7 @@ if __name__ == "__main__":
85 similarityMatrix = cosine_similarity(matrix) 90 similarityMatrix = cosine_similarity(matrix)
86 print(" Cosine similarity matrix shape: {}".format(similarityMatrix.shape)) 91 print(" Cosine similarity matrix shape: {}".format(similarityMatrix.shape))
87 92
88 - with open(os.path.join(args.outputPath, "report-vectorizer.{}.txt".format(args.vectorizer)), encoding="utf-8", mode="w") as oFile: 93 + with open(os.path.join(args.outputPath, "report-vectorizer.{}.{}.txt".format(args.feature, args.vectorizer)), encoding="utf-8", mode="w") as oFile:
89 oFile.write("Vectorizer: {}\n".format(args.vectorizer)) 94 oFile.write("Vectorizer: {}\n".format(args.vectorizer))
90 oFile.write(str(vectorizer.get_feature_names())) 95 oFile.write(str(vectorizer.get_feature_names()))
91 oFile.write("\n") 96 oFile.write("\n")
......