skyer9 · August 11, 2019 04:50 · skyer9 · Aug 10, 2019
diff --git a/tfidf.py b/tfidf.py
 # -*- coding: utf-8 -*-
 import pyspark
 from pyspark.sql import SQLContext


 # +--------------+---------+------------------+
 # |        doc_id|    token|             tfidf|
 # +--------------+---------+------------------+
 # |R13E0LFYMKV62T|         |12.613611576441143|
 # |R1QYQ8ZJ9LS8L9|     high|11.634883586257445|
 # |R1QYQ8ZJ9LS8L9|    flies| 9.942558014017578|
 # |R162SXU36VVE3N|   toogoo| 9.942558014017578|
 # |R1QYQ8ZJ9LS8L9|       so| 9.656627474604601|
 # |R2HZQ7EE1JKASF|       br| 9.591581091193483|
 # |R2E07B9J2KB60P|     only| 8.726162689693084|
 # |R2TCCMZE9ME0JS|perfectly| 7.863116472337741|
 # |R2TCCMZE9ME0JS|       as|7.2404344315850055|
 # | RN1CRPKNIX49D|      ive| 7.193685818395112|
 # |R3V50MBK65GGKT|     line| 6.818992368953701|
 # |R3E4XGTZ5NC4QC|      all| 6.646721148013247|
 # |R3MK80M8WDXAX5|  without| 6.646721148013247|
 # |R13E0LFYMKV62T|   across|6.6283720093450516|
 # |R3GRH2J37F8FE7|  company|6.6283720093450516|
 # |R205FXUV0AWV3H|     each|6.6283720093450516|
 # |R2RCEVGMYOW5D7|    havnt|6.6283720093450516|
 # |R2TCCMZE9ME0JS|      her|6.6283720093450516|
 # |R1QYQ8ZJ9LS8L9|   highbr|6.6283720093450516|
 # |R2HZQ7EE1JKASF|    metal|6.6283720093450516|
 # +--------------+---------+------------------+


 if __name__ == '__main__':
    sc = pyspark.SparkContext.getOrCreate()

    sqlContext = SQLContext(sc)

    # df = sqlContext.createDataFrame([
    #     (0, 'bbb ccc1 fff, ggg. hhh?'),
    #     (1, 'aa bbb bbb ccc ddd, d12.'),
    #     (2, 'aaa bb eee'),
    # ], ['doc_id', 'doc_str'])

    # df.createOrReplaceTempView('tmp_view')

    # sql = """
    #     SELECT
    #         regexp_replace(lower(doc_str), '[^a-z ]', '') as doc_str
    #     FROM
    #         tmp_view
    # """

    # doc = sqlContext.sql(sql)

    # doc.show()

    # 출처 : https://s3.amazonaws.com/amazon-reviews-pds/tsv/index.txt
    df = sqlContext.read.load('hdfs://172.31.29.43:9000/amazon_reviews_us_Toys_v1_00.tsv.gz', format='csv', sep='\t', header='true')

    df.createOrReplaceTempView('tmp_reviewdata')

    sqls = """
        CREATE OR REPLACE TEMPORARY VIEW tmp_ratingdata AS
        SELECT
            review_id as doc_id, regexp_replace(lower(review_body), '[^a-z ]', '') as doc_str
        FROM
             tmp_reviewdata
        WHERE
            product_parent = '100005735';

        CREATE OR REPLACE TEMPORARY VIEW tmp_rawdata AS
        SELECT doc_id, doc_str, explode(split(doc_str, ' ')) as token
        FROM tmp_ratingdata;

        CREATE OR REPLACE TEMPORARY VIEW tmp_tf AS
        SELECT doc_id, token, count(*) as tf
        FROM tmp_rawdata
        GROUP BY doc_id, token;

        CREATE OR REPLACE TEMPORARY VIEW tmp_df AS
        SELECT
            token,
            count(distinct doc_id) as df,
            (
                SELECT count(distinct doc_id) as d
                FROM tmp_rawdata
            ) d
        FROM
            tmp_rawdata R
        GROUP BY token, d;

        CREATE OR REPLACE TEMPORARY VIEW tmp_idf AS
        SELECT
            token,
            log((d + 1) / (df + 1)) as idf
        FROM
            tmp_df;

        SELECT
            T.doc_id,
            T.token,
            (T.tf * I.idf) as tfidf
        FROM
            tmp_tf T
            JOIN tmp_idf I ON T.token = I.token
        ORDER BY
            tfidf DESC, T.token, T.doc_id;
    """
    sqlarray = sqls.split(';')

    result = None
    for i in range(len(sqlarray)):
        sql = sqlarray[i].rstrip()
        if sql != '':
            result = sqlContext.sql(sql)

    result.show()

    sc.stop()
	# -- coding: utf-8 --
	import pyspark
	from pyspark.sql import SQLContext


	# +--------------+---------+------------------+
	# \| doc_id\| token\| tfidf\|
	# +--------------+---------+------------------+
	# \|R13E0LFYMKV62T\| \|12.613611576441143\|
	# \|R1QYQ8ZJ9LS8L9\| high\|11.634883586257445\|
	# \|R1QYQ8ZJ9LS8L9\| flies\| 9.942558014017578\|
	# \|R162SXU36VVE3N\| toogoo\| 9.942558014017578\|
	# \|R1QYQ8ZJ9LS8L9\| so\| 9.656627474604601\|
	# \|R2HZQ7EE1JKASF\| br\| 9.591581091193483\|
	# \|R2E07B9J2KB60P\| only\| 8.726162689693084\|
	# \|R2TCCMZE9ME0JS\|perfectly\| 7.863116472337741\|
	# \|R2TCCMZE9ME0JS\| as\|7.2404344315850055\|
	# \| RN1CRPKNIX49D\| ive\| 7.193685818395112\|
	# \|R3V50MBK65GGKT\| line\| 6.818992368953701\|
	# \|R3E4XGTZ5NC4QC\| all\| 6.646721148013247\|
	# \|R3MK80M8WDXAX5\| without\| 6.646721148013247\|
	# \|R13E0LFYMKV62T\| across\|6.6283720093450516\|
	# \|R3GRH2J37F8FE7\| company\|6.6283720093450516\|
	# \|R205FXUV0AWV3H\| each\|6.6283720093450516\|
	# \|R2RCEVGMYOW5D7\| havnt\|6.6283720093450516\|
	# \|R2TCCMZE9ME0JS\| her\|6.6283720093450516\|
	# \|R1QYQ8ZJ9LS8L9\| highbr\|6.6283720093450516\|
	# \|R2HZQ7EE1JKASF\| metal\|6.6283720093450516\|
	# +--------------+---------+------------------+


	if __name__ == '__main__':
	sc = pyspark.SparkContext.getOrCreate()

	sqlContext = SQLContext(sc)

	# df = sqlContext.createDataFrame([
	# (0, 'bbb ccc1 fff, ggg. hhh?'),
	# (1, 'aa bbb bbb ccc ddd, d12.'),
	# (2, 'aaa bb eee'),
	# ], ['doc_id', 'doc_str'])

	# df.createOrReplaceTempView('tmp_view')

	# sql = """
	# SELECT
	# regexp_replace(lower(doc_str), '[^a-z ]', '') as doc_str
	# FROM
	# tmp_view
	# """

	# doc = sqlContext.sql(sql)

	# doc.show()

	# 출처 : https://s3.amazonaws.com/amazon-reviews-pds/tsv/index.txt
	df = sqlContext.read.load('hdfs://172.31.29.43:9000/amazon_reviews_us_Toys_v1_00.tsv.gz', format='csv', sep='\t', header='true')

	df.createOrReplaceTempView('tmp_reviewdata')

	sqls = """
	CREATE OR REPLACE TEMPORARY VIEW tmp_ratingdata AS
	SELECT
	review_id as doc_id, regexp_replace(lower(review_body), '[^a-z ]', '') as doc_str
	FROM
	tmp_reviewdata
	WHERE
	product_parent = '100005735';

	CREATE OR REPLACE TEMPORARY VIEW tmp_rawdata AS
	SELECT doc_id, doc_str, explode(split(doc_str, ' ')) as token
	FROM tmp_ratingdata;

	CREATE OR REPLACE TEMPORARY VIEW tmp_tf AS
	SELECT doc_id, token, count(*) as tf
	FROM tmp_rawdata
	GROUP BY doc_id, token;

	CREATE OR REPLACE TEMPORARY VIEW tmp_df AS
	SELECT
	token,
	count(distinct doc_id) as df,
	(
	SELECT count(distinct doc_id) as d
	FROM tmp_rawdata
	) d
	FROM
	tmp_rawdata R
	GROUP BY token, d;

	CREATE OR REPLACE TEMPORARY VIEW tmp_idf AS
	SELECT
	token,
	log((d + 1) / (df + 1)) as idf
	FROM
	tmp_df;

	SELECT
	T.doc_id,
	T.token,
	(T.tf * I.idf) as tfidf
	FROM
	tmp_tf T
	JOIN tmp_idf I ON T.token = I.token
	ORDER BY
	tfidf DESC, T.token, T.doc_id;
	"""
	sqlarray = sqls.split(';')

	result = None
	for i in range(len(sqlarray)):
	sql = sqlarray[i].rstrip()
	if sql != '':
	result = sqlContext.sql(sql)

	result.show()

	sc.stop()