Showing 1 changed file with 12 additions and 7 deletions
| ... | @@ -19,16 +19,17 @@ __author__ = 'CMendezC' | ... | @@ -19,16 +19,17 @@ __author__ = 'CMendezC' |
| 19 | # 3) --vectorizer Vectorizer: b=binary, f=frequency, t=tf-idf. | 19 | # 3) --vectorizer Vectorizer: b=binary, f=frequency, t=tf-idf. |
| 20 | 20 | ||
| 21 | # Output: | 21 | # Output: |
| 22 | -# 1) Files with vectors. | 22 | +# 1) Report with dictionary, vectors, cosine similarity matrix. |
| 23 | 23 | ||
| 24 | # Execution: | 24 | # Execution: |
| 25 | # python extraccion-caracteristicas-vectorizacion.py | 25 | # python extraccion-caracteristicas-vectorizacion.py |
| 26 | # --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/data-set-three-sentences | 26 | # --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/data-set-three-sentences |
| 27 | # --outputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/reports-three-sentences | 27 | # --outputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/reports-three-sentences |
| 28 | # --vectorizer b | 28 | # --vectorizer b |
| 29 | +# --feature word | ||
| 29 | 30 | ||
| 30 | # source activate python3 | 31 | # source activate python3 |
| 31 | -# python extraccion-caracteristicas-vectorizacion.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/data-set-three-sentences --outputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/reports-three-sentences --vectorizer b | 32 | +# python extraccion-caracteristicas-vectorizacion.py --inputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/data-set-three-sentences --outputPath /home/compu2/bionlp/lcg-bioinfoI-bionlp/representaciones-vectoriales/reports-three-sentences --vectorizer b --feature word |
| 32 | 33 | ||
| 33 | ########################################################### | 34 | ########################################################### |
| 34 | # MAIN PROGRAM # | 35 | # MAIN PROGRAM # |
| ... | @@ -44,6 +45,9 @@ if __name__ == "__main__": | ... | @@ -44,6 +45,9 @@ if __name__ == "__main__": |
| 44 | parser.add_argument("--vectorizer", dest="vectorizer", required=True, | 45 | parser.add_argument("--vectorizer", dest="vectorizer", required=True, |
| 45 | help="Vectorizer: b=binary, f=frequency, t=tf-idf", metavar="CHAR", | 46 | help="Vectorizer: b=binary, f=frequency, t=tf-idf", metavar="CHAR", |
| 46 | choices=('b', 'f', 't'), default='b') | 47 | choices=('b', 'f', 't'), default='b') |
| 48 | + parser.add_argument("--feature", dest="feature", required=True, | ||
| 49 | + help="Feature: word, lemma, pos", metavar="TEXT", | ||
| 50 | + choices=('word', 'lemma', 'pos'), default='b') | ||
| 47 | 51 | ||
| 48 | args = parser.parse_args() | 52 | args = parser.parse_args() |
| 49 | 53 | ||
| ... | @@ -61,10 +65,11 @@ if __name__ == "__main__": | ... | @@ -61,10 +65,11 @@ if __name__ == "__main__": |
| 61 | # Read documents from input path | 65 | # Read documents from input path |
| 62 | for path, dirs, files in os.walk(args.inputPath): | 66 | for path, dirs, files in os.walk(args.inputPath): |
| 63 | for file in files: | 67 | for file in files: |
| 64 | - with open(os.path.join(args.inputPath, file), mode="r", encoding="utf-8") as iFile: | 68 | + if file.endswith(args.feature): |
| 65 | - print("...{}".format(file)) | 69 | + with open(os.path.join(args.inputPath, file), mode="r", encoding="utf-8") as iFile: |
| 66 | - # Add file to document list | 70 | + print("...{}".format(file)) |
| 67 | - documents.append(iFile.read()) | 71 | + # Add file to document list |
| 72 | + documents.append(iFile.read()) | ||
| 68 | print(" Documents: {}".format(len(documents))) | 73 | print(" Documents: {}".format(len(documents))) |
| 69 | 74 | ||
| 70 | # Create vectorizer | 75 | # Create vectorizer |
| ... | @@ -85,7 +90,7 @@ if __name__ == "__main__": | ... | @@ -85,7 +90,7 @@ if __name__ == "__main__": |
| 85 | similarityMatrix = cosine_similarity(matrix) | 90 | similarityMatrix = cosine_similarity(matrix) |
| 86 | print(" Cosine similarity matrix shape: {}".format(similarityMatrix.shape)) | 91 | print(" Cosine similarity matrix shape: {}".format(similarityMatrix.shape)) |
| 87 | 92 | ||
| 88 | - with open(os.path.join(args.outputPath, "report-vectorizer.{}.txt".format(args.vectorizer)), encoding="utf-8", mode="w") as oFile: | 93 | + with open(os.path.join(args.outputPath, "report-vectorizer.{}.{}.txt".format(args.feature, args.vectorizer)), encoding="utf-8", mode="w") as oFile: |
| 89 | oFile.write("Vectorizer: {}\n".format(args.vectorizer)) | 94 | oFile.write("Vectorizer: {}\n".format(args.vectorizer)) |
| 90 | oFile.write(str(vectorizer.get_feature_names())) | 95 | oFile.write(str(vectorizer.get_feature_names())) |
| 91 | oFile.write("\n") | 96 | oFile.write("\n") | ... | ... |
-
Please register or log in to post a comment