nmukerje · March 1, 2021 04:07
diff --git a/gdelta_parquet.py b/gdelta_parquet.py
 # Convert the 2016 GDELT event files (tab-delimited text on S3) to Parquet,
 # partitioned by Year/Month/Day. Expects a SparkSession bound to `spark`.

 # Get the column names
 from contextlib import closing
 try:
     from urllib.request import urlopen  # Python 3
 except ImportError:
     from urllib import urlopen  # Python 2 fallback
 # closing() releases the HTTP connection once the header has been read.
 with closing(urlopen("http://gdeltproject.org/data/lookups/CSV.header.dailyupdates.txt")) as response:
     # read() yields bytes on Python 3; decode so the tab split works on text.
     html = response.read().decode("utf-8").rstrip()
 columns = html.split('\t')
 # Load 73,385,698 records from 2016
 df1 = spark.read.option("delimiter", "\t").csv("s3://gdelt-open-data/events/2016*")
 # Apply the schema
 df2 = df1.toDF(*columns)
 # Derive Month and Day from SQLDATE (characters 5-6 and 7-8; SQL substring
 # is 1-based).
 # NOTE(review): "Year" is not created here, so it is presumably already one
 # of the header columns -- confirm against the header file.
 from pyspark.sql.functions import expr
 df3 = df2.withColumn("Month", expr("substring(SQLDATE, 5, 2)")).withColumn("Day", expr("substring(SQLDATE, 7, 2)"))
 # Write to parquet in S3
 cols = ["Year", "Month", "Day"]
 # Repartition on the same columns used for the output partitioning before
 # the partitioned write.
 df3.repartition(*cols).write.mode("append").partitionBy(cols).parquet("s3://<bucket>/gdelt/")
	# Convert the 2016 GDELT event files (tab-delimited text on S3) to Parquet,
	# partitioned by Year/Month/Day. Expects a SparkSession bound to `spark`.

	# Get the column names
	from contextlib import closing
	try:
		from urllib.request import urlopen  # Python 3
	except ImportError:
		from urllib import urlopen  # Python 2 fallback
	# closing() releases the HTTP connection once the header has been read.
	with closing(urlopen("http://gdeltproject.org/data/lookups/CSV.header.dailyupdates.txt")) as response:
		# read() yields bytes on Python 3; decode so the tab split works on text.
		html = response.read().decode("utf-8").rstrip()
	columns = html.split('\t')
	# Load 73,385,698 records from 2016
	df1 = spark.read.option("delimiter", "\t").csv("s3://gdelt-open-data/events/2016*")
	# Apply the schema
	df2 = df1.toDF(*columns)
	# Derive Month and Day from SQLDATE (characters 5-6 and 7-8; SQL substring
	# is 1-based).
	# NOTE(review): "Year" is not created here, so it is presumably already one
	# of the header columns -- confirm against the header file.
	from pyspark.sql.functions import expr
	df3 = df2.withColumn("Month", expr("substring(SQLDATE, 5, 2)")).withColumn("Day", expr("substring(SQLDATE, 7, 2)"))
	# Write to parquet in S3
	cols = ["Year", "Month", "Day"]
	# Repartition on the same columns used for the output partitioning before
	# the partitioned write.
	df3.repartition(*cols).write.mode("append").partitionBy(cols).parquet("s3://<bucket>/gdelt/")