Showing 1 changed file with 8 additions and 3 deletions
... | @@ -19,16 +19,17 @@ __author__ = 'CMendezC' | ... | @@ -19,16 +19,17 @@ __author__ = 'CMendezC' |
19 | # 3) --vectorizer Vectorizer: b=binary, f=frequency, t=tf-idf. | 19 | # 3) --vectorizer Vectorizer: b=binary, f=frequency, t=tf-idf. |
20 | 20 | ||
21 | # Output: | 21 | # Output: |
22 | -# 1) Files with vectors. | 22 | +# 1) Report with dictionary, vectors, cosine similarity matrix. |
23 | 23 | ||
24 | # Execution: | 24 | # Execution: |
25 | # python extraccion-caracteristicas-vectorizacion.py | 25 | # python extraccion-caracteristicas-vectorizacion.py |
26 | # --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/data-set-three-sentences | 26 | # --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/data-set-three-sentences |
27 | # --outputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/reports-three-sentences | 27 | # --outputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/reports-three-sentences |
28 | # --vectorizer b | 28 | # --vectorizer b |
29 | +# --feature word | ||
29 | 30 | ||
30 | # source activate python3 | 31 | # source activate python3 |
31 | -# python extraccion-caracteristicas-vectorizacion.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/data-set-three-sentences --outputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/reports-three-sentences --vectorizer b | 32 | +# python extraccion-caracteristicas-vectorizacion.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/data-set-three-sentences --outputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/reports-three-sentences --vectorizer b --feature word |
32 | 33 | ||
33 | ########################################################### | 34 | ########################################################### |
34 | # MAIN PROGRAM # | 35 | # MAIN PROGRAM # |
... | @@ -44,6 +45,9 @@ if __name__ == "__main__": | ... | @@ -44,6 +45,9 @@ if __name__ == "__main__": |
44 | parser.add_argument("--vectorizer", dest="vectorizer", required=True, | 45 | parser.add_argument("--vectorizer", dest="vectorizer", required=True, |
45 | help="Vectorizer: b=binary, f=frequency, t=tf-idf", metavar="CHAR", | 46 | help="Vectorizer: b=binary, f=frequency, t=tf-idf", metavar="CHAR", |
46 | choices=('b', 'f', 't'), default='b') | 47 | choices=('b', 'f', 't'), default='b') |
48 | + parser.add_argument("--feature", dest="feature", required=True, | ||
49 | + help="Feature: word, lemma, pos", metavar="TEXT", | ||
50 | + choices=('word', 'lemma', 'pos'), default='b') | ||
47 | 51 | ||
48 | args = parser.parse_args() | 52 | args = parser.parse_args() |
49 | 53 | ||
... | @@ -61,6 +65,7 @@ if __name__ == "__main__": | ... | @@ -61,6 +65,7 @@ if __name__ == "__main__": |
61 | # Read documents from input path | 65 | # Read documents from input path |
62 | for path, dirs, files in os.walk(args.inputPath): | 66 | for path, dirs, files in os.walk(args.inputPath): |
63 | for file in files: | 67 | for file in files: |
68 | + if file.endswith(args.feature): | ||
64 | with open(os.path.join(args.inputPath, file), mode="r", encoding="utf-8") as iFile: | 69 | with open(os.path.join(args.inputPath, file), mode="r", encoding="utf-8") as iFile: |
65 | print("...{}".format(file)) | 70 | print("...{}".format(file)) |
66 | # Add file to document list | 71 | # Add file to document list |
... | @@ -85,7 +90,7 @@ if __name__ == "__main__": | ... | @@ -85,7 +90,7 @@ if __name__ == "__main__": |
85 | similarityMatrix = cosine_similarity(matrix) | 90 | similarityMatrix = cosine_similarity(matrix) |
86 | print(" Cosine similarity matrix shape: {}".format(similarityMatrix.shape)) | 91 | print(" Cosine similarity matrix shape: {}".format(similarityMatrix.shape)) |
87 | 92 | ||
88 | - with open(os.path.join(args.outputPath, "report-vectorizer.{}.txt".format(args.vectorizer)), encoding="utf-8", mode="w") as oFile: | 93 | + with open(os.path.join(args.outputPath, "report-vectorizer.{}.{}.txt".format(args.feature, args.vectorizer)), encoding="utf-8", mode="w") as oFile: |
89 | oFile.write("Vectorizer: {}\n".format(args.vectorizer)) | 94 | oFile.write("Vectorizer: {}\n".format(args.vectorizer)) |
90 | oFile.write(str(vectorizer.get_feature_names())) | 95 | oFile.write(str(vectorizer.get_feature_names())) |
91 | oFile.write("\n") | 96 | oFile.write("\n") | ... | ... |
-
Please register or login to post a comment