Last active
August 11, 2019 04:50
-
-
Save skyer9/afd34970912106ac2b7c1f87557771c9 to your computer and use it in GitHub Desktop.
TF IDF with amazon review dataset
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import pyspark | |
from pyspark.sql import SQLContext | |
# +--------------+---------+------------------+ | |
# | doc_id| token| tfidf| | |
# +--------------+---------+------------------+ | |
# |R13E0LFYMKV62T| |12.613611576441143| | |
# |R1QYQ8ZJ9LS8L9| high|11.634883586257445| | |
# |R1QYQ8ZJ9LS8L9| flies| 9.942558014017578| | |
# |R162SXU36VVE3N| toogoo| 9.942558014017578| | |
# |R1QYQ8ZJ9LS8L9| so| 9.656627474604601| | |
# |R2HZQ7EE1JKASF| br| 9.591581091193483| | |
# |R2E07B9J2KB60P| only| 8.726162689693084| | |
# |R2TCCMZE9ME0JS|perfectly| 7.863116472337741| | |
# |R2TCCMZE9ME0JS| as|7.2404344315850055| | |
# | RN1CRPKNIX49D| ive| 7.193685818395112| | |
# |R3V50MBK65GGKT| line| 6.818992368953701| | |
# |R3E4XGTZ5NC4QC| all| 6.646721148013247| | |
# |R3MK80M8WDXAX5| without| 6.646721148013247| | |
# |R13E0LFYMKV62T| across|6.6283720093450516| | |
# |R3GRH2J37F8FE7| company|6.6283720093450516| | |
# |R205FXUV0AWV3H| each|6.6283720093450516| | |
# |R2RCEVGMYOW5D7| havnt|6.6283720093450516| | |
# |R2TCCMZE9ME0JS| her|6.6283720093450516| | |
# |R1QYQ8ZJ9LS8L9| highbr|6.6283720093450516| | |
# |R2HZQ7EE1JKASF| metal|6.6283720093450516| | |
# +--------------+---------+------------------+ | |
if __name__ == '__main__': | |
sc = pyspark.SparkContext.getOrCreate() | |
sqlContext = SQLContext(sc) | |
# df = sqlContext.createDataFrame([ | |
# (0, 'bbb ccc1 fff, ggg. hhh?'), | |
# (1, 'aa bbb bbb ccc ddd, d12.'), | |
# (2, 'aaa bb eee'), | |
# ], ['doc_id', 'doc_str']) | |
# df.createOrReplaceTempView('tmp_view') | |
# sql = """ | |
# SELECT | |
# regexp_replace(lower(doc_str), '[^a-z ]', '') as doc_str | |
# FROM | |
# tmp_view | |
# """ | |
# doc = sqlContext.sql(sql) | |
# doc.show() | |
# 출처 : https://s3.amazonaws.com/amazon-reviews-pds/tsv/index.txt | |
df = sqlContext.read.load('hdfs://172.31.29.43:9000/amazon_reviews_us_Toys_v1_00.tsv.gz', format='csv', sep='\t', header='true') | |
df.createOrReplaceTempView('tmp_reviewdata') | |
sqls = """ | |
CREATE OR REPLACE TEMPORARY VIEW tmp_ratingdata AS | |
SELECT | |
review_id as doc_id, regexp_replace(lower(review_body), '[^a-z ]', '') as doc_str | |
FROM | |
tmp_reviewdata | |
WHERE | |
product_parent = '100005735'; | |
CREATE OR REPLACE TEMPORARY VIEW tmp_rawdata AS | |
SELECT doc_id, doc_str, explode(split(doc_str, ' ')) as token | |
FROM tmp_ratingdata; | |
CREATE OR REPLACE TEMPORARY VIEW tmp_tf AS | |
SELECT doc_id, token, count(*) as tf | |
FROM tmp_rawdata | |
GROUP BY doc_id, token; | |
CREATE OR REPLACE TEMPORARY VIEW tmp_df AS | |
SELECT | |
token, | |
count(distinct doc_id) as df, | |
( | |
SELECT count(distinct doc_id) as d | |
FROM tmp_rawdata | |
) d | |
FROM | |
tmp_rawdata R | |
GROUP BY token, d; | |
CREATE OR REPLACE TEMPORARY VIEW tmp_idf AS | |
SELECT | |
token, | |
log((d + 1) / (df + 1)) as idf | |
FROM | |
tmp_df; | |
SELECT | |
T.doc_id, | |
T.token, | |
(T.tf * I.idf) as tfidf | |
FROM | |
tmp_tf T | |
JOIN tmp_idf I ON T.token = I.token | |
ORDER BY | |
tfidf DESC, T.token, T.doc_id; | |
""" | |
sqlarray = sqls.split(';') | |
result = None | |
for i in range(len(sqlarray)): | |
sql = sqlarray[i].rstrip() | |
if sql != '': | |
result = sqlContext.sql(sql) | |
result.show() | |
sc.stop() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
temp table, multi query 기능 부재로 저렇게 지저분하게 코딩했는데...
조만간 기능추가가 되지 않을까 합니다.