Created: August 3, 2020 15:29
Save snehamehrin/1d2a97f037ffd35dbd07b667f210ae32 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pyspark.sql.functions import unix_timestamp, to_date, date_format, month, year, dayofyear, dayofweek, col
from pyspark.sql.types import TimestampType
import pyspark.sql.functions as F

# Initialize the Spark session.
from pyspark.sql import SparkSession
spark = SparkSession \
    .builder \
    .appName('Stack Overflow ML') \
    .getOrCreate()
print('Session created')
sc = spark.sparkContext

# Load the raw firehose dump (one JSON record per line).
# NOTE(review): path has no leading '/' — on Databricks a DBFS mount is
# normally '/mnt/...'; confirm this resolves relative to the driver.
stack = sc.textFile('mnt/stack-overflow-bucket/stack_firehose_stream', 1)

# Parse each JSON line into a DataFrame with one column per key.
df = spark.read.json(stack)

# Total number of raw records (duplicates included).
df.count()

# Per-question record counts.
# BUGFIX: the original referenced `df_duplicates` before it was defined
# (NameError), and grouping the already-deduplicated frame would have made
# every dup_cnt == 1. Count on the raw `df` instead; the redundant
# `.select([col for col in df.columns])` (which also shadowed the imported
# `col` function) was a no-op before groupBy and is dropped.
df_no_dup = df.groupBy('question_id').agg(F.count('question_id').alias('dup_cnt'))
df_no_dup.count()  # number of distinct question_ids

# Drop duplicates, keeping one row per question_id.
df_duplicates = df.dropDuplicates(['question_id'])

# Summary statistics (count/mean/stddev/min/max) for answer_count.
df_duplicates.select('answer_count').describe().show()

# Convert the epoch-seconds timestamps to dd/MM/yyyy date strings.
df_duplicates = df_duplicates.withColumn("created_date", F.from_unixtime("creation_date", "dd/MM/yyyy"))
df_duplicates = df_duplicates.withColumn("last_activity_date", F.from_unixtime("last_activity_date", "dd/MM/yyyy"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment