Databracket9 · May 9, 2024 19:05
diff --git a/duckdb_postgres_pyspark_tutorial b/duckdb_postgres_pyspark_tutorial
 #library import
 import duckdb 
 import configparser
 from duckdb.experimental.spark.sql import SparkSession as session
 from duckdb.experimental.spark.sql.functions import col, when, lit

 #read configs from secrets file
 config = configparser.ConfigParser()
 config.read('duck_db_demo/secrets.ini')
 user = config['POSTGRES']['USER']
 password = config['POSTGRES']['PASSWORD']
 port = config['POSTGRES']['PORT']
 database = config['POSTGRES']['DATABASE']

 #install and load postgres extension
 duckdb.execute('INSTALL postgres;')
 duckdb.execute('LOAD postgres;')

 #connect to postgres
 duckdb.execute("ATTACH 'dbname={} user={} password={} host=127.0.0.1 port={}' AS db (TYPE POSTGRES);".format(database, user, password, port))

 #query postgres tables to test connectivity
 pg_tables = duckdb.sql("select * from pg_tables")

 #read csv file
 fake_currency = duckdb.read_csv("data/fake_currency_data.csv")

 #use csv dataframe variable as sql view without convertions
 temp_var = duckdb.sql("select * from fake_currency where Country = 'UK'")

 #convert to pandas dataframe
 temp_df = temp_var.df()

 #create sparksession
 spark = session.builder.getOrCreate()

 #pandas dataframe to pyspark dataframe
 UK_df = spark.createDataFrame(temp_df)

 #pyspark transformations
 UK_df = UK_df.withColumn("HighThickness", when((col("Thickness") > 0.05), col("Thickness")))
 UK_df = UK_df.where(col("SecurityFeatures") == "Watermark")
 UK_df = UK_df.toPandas()

 #data load or write to postgres sql
 duckdb.execute("CREATE TABLE IF NOT EXISTS db.uk_watername_sample AS FROM UK_df")
	#library import
	import duckdb
	import configparser
	from duckdb.experimental.spark.sql import SparkSession as session
	from duckdb.experimental.spark.sql.functions import col, when, lit

	#read configs from secrets file
	config = configparser.ConfigParser()
	config.read('duck_db_demo/secrets.ini')
	user = config['POSTGRES']['USER']
	password = config['POSTGRES']['PASSWORD']
	port = config['POSTGRES']['PORT']
	database = config['POSTGRES']['DATABASE']

	#install and load postgres extension
	duckdb.execute('INSTALL postgres;')
	duckdb.execute('LOAD postgres;')

	#connect to postgres
	duckdb.execute("ATTACH 'dbname={} user={} password={} host=127.0.0.1 port={}' AS db (TYPE POSTGRES);".format(database, user, password, port))

	#query postgres tables to test connectivity
	pg_tables = duckdb.sql("select * from pg_tables")

	#read csv file
	fake_currency = duckdb.read_csv("data/fake_currency_data.csv")

	#use csv dataframe variable as sql view without convertions
	temp_var = duckdb.sql("select * from fake_currency where Country = 'UK'")

	#convert to pandas dataframe
	temp_df = temp_var.df()

	#create sparksession
	spark = session.builder.getOrCreate()

	#pandas dataframe to pyspark dataframe
	UK_df = spark.createDataFrame(temp_df)

	#pyspark transformations
	UK_df = UK_df.withColumn("HighThickness", when((col("Thickness") > 0.05), col("Thickness")))
	UK_df = UK_df.where(col("SecurityFeatures") == "Watermark")
	UK_df = UK_df.toPandas()

	#data load or write to postgres sql
	duckdb.execute("CREATE TABLE IF NOT EXISTS db.uk_watername_sample AS FROM UK_df")
No results found