Reading notes: 『情報検索の基礎』 (Introduction to Information Retrieval), Chapter 1, part 2
import glob
import os
import string

import pandas as pd


def tokenize(line):
    # Split a line into whitespace-separated tokens.
    return line.split()


def normalize(line):
    # Strip punctuation characters from the line.
    table = line.maketrans('', '', string.punctuation)
    return line.translate(table)


def create_incidence_matrix(
        document_paths,
        tokenize=tokenize,
        normalize=normalize):
    """Create a term-document incidence matrix from the texts under
    the given document paths (one document = all *.txt files under
    one path).
    """
    docs = []
    # Collect the set of distinct terms appearing in each document.
    for p in document_paths:
        terms = []
        for txt in glob.glob(os.path.join(p, '*.txt')):
            with open(txt) as f:
                terms.append(sum(
                    [tokenize(normalize(l)) for l in f], []))
        docs.append(list(set(sum(terms, []))))
    # Build the dictionary (vocabulary) as the union of all document terms.
    vocab = list(set(sum(docs, [])))
    # Build the matrix: entry is 1 if the term occurs in the document, else 0.
    data = dict((os.path.basename(p),
                 dict((w, 1 if w in doc else 0) for w in vocab))
                for p, doc in zip(document_paths, docs))
    return pd.DataFrame(data)
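A rough usage sketch; the corpus/ directory names below are hypothetical and only illustrate the expected layout (one sub-directory of *.txt files per document):

if __name__ == '__main__':
    # Hypothetical layout: corpus/hamlet/*.txt, corpus/macbeth/*.txt, ...
    paths = ['corpus/hamlet', 'corpus/macbeth']
    matrix = create_incidence_matrix(paths)
    # Rows are vocabulary terms, columns are document names,
    # entries are 1 if the term occurs in the document, else 0.
    print(matrix)

As in Chapter 1 of the book, a Boolean query such as "Brutus AND Caesar" can then be answered by AND-ing the corresponding rows of the incidence matrix.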