kshirsagarsiddharth’s gists

kshirsagarsiddharth / location_filter.py

Created June 23, 2021 16:39

	# Keep only users whose location matches india, usa or china.
	# `rlike` takes a Java regular expression, where a bare `|` means
	# alternation; the earlier `\|` escaped the pipe and therefore only matched
	# the literal text "india|usa|china".
	location_filtered = users.filter(F.col("location").rlike(r"india|usa|china"))

	# Inner-join the matching users with `combined_and_filtered` (defined
	# elsewhere) on user_id, keep the four listed columns and cast user_id to int.
	filtered_with_location = (
		location_filtered.join(combined_and_filtered, on='user_id', how='inner')
		.select('user_id', 'isbn', 'book_rating', 'book_title')
		.withColumn('user_id', F.col('user_id').cast('int'))
	)

kshirsagarsiddharth / combine.py

Created June 23, 2021 16:35

	# Register both DataFrames as temp views so the SQL below can refer to them
	# by name.
	books.createOrReplaceTempView('books')
	ratings.createOrReplaceTempView('ratings')

	# Inner join of ratings with books on isbn, keeping the user id, isbn,
	# rating and title columns.
	join_query = """
	SELECT user_id,r.isbn,book_rating,book_title
	FROM ratings r INNER JOIN books b
	ON r.isbn = b.isbn
	"""
	combined_dataframe = spark.sql(join_query)
	# we only keep those books that 15 or more users have rated and those books that have 50 or more ratings

kshirsagarsiddharth / location.py

Created June 23, 2021 16:28

	# Count the rows of `users` for each location value.
	location_counts = users.groupBy('location').count()

	# Cast the count to int, move the result to pandas and sort ascending by count.
	temp_df = (
		location_counts
		.select('location', F.col('count').cast('int'))
		.toPandas()
		.sort_values(by='count')
	)

	# Plot the distribution of the per-location counts.
	sns.distplot(temp_df['count'])
	plt.title('Location')
	plt.show()

kshirsagarsiddharth / age.py

Created June 23, 2021 16:28

	# Count the rows of `users` for each age value.
	age_counts = users.groupBy('age').count()

	# Cast the count to int, move the result to pandas and sort ascending by count.
	temp_df = (
		age_counts
		.select('age', F.col('count').cast('int'))
		.toPandas()
		.sort_values(by='count')
	)

	# Plot the distribution of the per-age counts.
	sns.distplot(temp_df['count'])
	plt.title('age')
	plt.show()

kshirsagarsiddharth / rating.py

Created June 23, 2021 16:25

	## let's visualize some key parameters
	import pyspark.sql.functions as F
	import seaborn as sns
	# Count the rows of `ratings` for each book_rating value.
	rating_counts = ratings.groupBy('book_rating').count()

	# Cast the count to int, move the result to pandas and sort ascending by count.
	temp_df = (
		rating_counts
		.select('book_rating', F.col('count').cast('int'))
		.toPandas()
		.sort_values(by='count')
	)

	# One bar per rating value, bar height = number of rows with that rating.
	sns.barplot(data=temp_df, x='book_rating', y='count')

kshirsagarsiddharth / rename_columns.py

Created June 8, 2021 05:10

	# Rename the books columns to snake_case names.
	books = (
		books
		.withColumnRenamed("ISBN", "isbn")
		.withColumnRenamed("Book-Title", "book_title")
		.withColumnRenamed('Book-Author', 'book_author')
	)

	# Rename the users columns to snake_case names.
	users = (
		users
		.withColumnRenamed('User-ID', 'user_id')
		.withColumnRenamed('Location', 'location')
		.withColumnRenamed('Age', 'age')
	)

	ratings = ratings.withColumnRenamed('User-ID','user_id')\
	.withColumnRenamed('ISBN','isbn')\

kshirsagarsiddharth / load_data.py

Created June 8, 2021 04:53

recommendation loading data

	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt

	from pyspark.sql import SparkSession
	# Create a Spark session with application name 'rec', or reuse an existing one.
	spark = SparkSession.builder.appName('rec').getOrCreate()

	# Relative paths to the three BX CSV files: ratings, books and users.
	books_rating_path = "book_recommendation/book_recommendation/BX-Book-Ratings.csv"
	books_path = "book_recommendation/book_recommendation/BX-Books.csv"
	books_user_path = "book_recommendation/book_recommendation/BX-Users.csv"

kshirsagarsiddharth / find_quotient.py

Created October 16, 2020 12:34

find quotient of two numbers

	def find_quotient(x,y):
	# keep the value of power as large as possible
	result,power = 0,16
	# this is (2^k)y, initially defined
	# in this case it is (2^16)y
	y_power = y << power

	# because we are successfully dividing we stop when
	# x is less than y
	while x >= y:

kshirsagarsiddharth / crawler4.py

Created September 1, 2020 12:06

crawler4

	# Announce start-up, then ask for the start URL and the thread count.
	# The thread count is stored exactly as typed; no conversion happens here.
	print("The Crawler is started")
	base_url = input("Please Enter Website to Crawl > ")
	number_of_threads = input("Please Enter number of Threads > ")

	# Bookkeeping containers, both empty at start-up.
	have_visited = set()
	crawler_threads = []

	# Lock object plus the queue of links, seeded with the start URL.
	url_lock = threading.Lock()
	links_to_crawl = queue.Queue()
	links_to_crawl.put(base_url)

kshirsagarsiddharth / crawler3.py

Created September 1, 2020 12:05

crawler3

	def run(self):
	# we create an SSL context so that our script can crawl
	# the https sites with an SSL handshake.

	#Create a SSLContext object with default settings.
	my_ssl = ssl.create_default_context()

	# by default, when creating a default ssl context and making a handshake,
	# the hostname is verified against the certificate, but our objective is to crawl
	# the webpage, so we will not be checking the validity of the certificate.