Skip to content

Instantly share code, notes, and snippets.

View ChengzhiZhao's full-sized avatar

Chengzhi Zhao ChengzhiZhao

View GitHub Profile
@ChengzhiZhao
ChengzhiZhao / rust_plotters.rs
Created February 17, 2022 06:25
Rust Plotters Data Visualization
use plotters::prelude::*;
use std::error::Error;
use fast_float::parse;
fn read_csv() -> Result<(Vec<f64>,Vec<i32>), Box<dyn Error>> {
// Build the CSV reader and iterate over each record.
let mut rdr = csv::Reader::from_path("~/Downloads/kc_house_data.csv")?;
let mut price:Vec<f64> = Vec::new();
let mut sqft_living:Vec<i32> = Vec::new();
@ChengzhiZhao
ChengzhiZhao / rust_plotters.rs
Last active February 16, 2022 07:36
Rust Plotters Data Visualization
use plotters::prelude::*;
use std::error::Error;
fn read_csv() -> Result<(Vec<String>, Vec<f64>,Vec<f64>), Box<dyn Error>> {
// Build the CSV reader and iterate over each record.
// Data Source is from https://www.kaggle.com/johnharshith/hollywood-theatrical-market-synopsis-1995-to-2021/version/2?select=TopGenres.csv
let mut rdr = csv::Reader::from_path("~/Downloads/TopGenres.csv")?;
let mut genres:Vec<String> = Vec::new();
let mut movies:Vec<f64> = Vec::new();
let mut market_share:Vec<f64> = Vec::new();
@ChengzhiZhao
ChengzhiZhao / Perfect_hello_world_example.py
Created December 22, 2021 17:54
Hello World Example for Prefect
import prefect
from prefect import task, Flow
@task
def hello_task():
    """Log a "Hello world!" message via the logger found in the Prefect context."""
    # The logger is looked up under the "logger" key of prefect.context.
    logger = prefect.context.get("logger")
    logger.info("Hello world!")
# Build the flow named "hello-flow"; hello_task() is called inside the Flow context.
# NOTE(review): presumably this call adds the task to the flow rather than running it
# immediately — confirm against the Prefect documentation.
with Flow("hello-flow") as flow:
    hello_task()
@ChengzhiZhao
ChengzhiZhao / spark_SALT.scala
Created December 9, 2021 07:06
spark_SALT.scala
// Salted two-stage aggregation: aggregate per (group fields, salt) first, then per group fields.
// NOTE(review): df, n, groupByFields and aggFields are defined outside this snippet.
// NOTE(review): reusing aggFields in both stages only matches a single-stage aggregation
// for functions that can be re-aggregated (e.g. sum/min/max) — verify for count/avg.
df.withColumn("salt_random_column", (rand * n).cast(IntegerType)) // n is the size of partition you'd like to have
// Stage 1: group by the original fields plus the salt column and aggregate.
.groupBy(groupByFields, "salt_random_column")
.agg(aggFields)
// Stage 2: group by the original fields only and aggregate the stage-1 results.
.groupBy(groupByFields)
.agg(aggFields)
@ChengzhiZhao
ChengzhiZhao / datafusion_1_0_0.rs
Created August 4, 2020 17:39
DataFusion Select Statement 1.0.0
use arrow::util::pretty;
use std::time::{Duration, Instant};
use datafusion::datasource::csv::CsvReadOptions;
use datafusion::error::Result;
use datafusion::execution::context::ExecutionContext;
/// This example demonstrates executing a simple query against an Arrow data source (CSV) and
/// fetching results
fn main() -> Result<()> {
let start = Instant::now();
@ChengzhiZhao
ChengzhiZhao / DataFusion_First_Select_Query.rs
Last active July 24, 2020 01:49
DataFusion First Select Query
use std::sync::Arc;
extern crate arrow;
extern crate datafusion;
use arrow::array::{Int32Array, Float64Array};
use arrow::datatypes::{DataType, Field, Schema};
use std::time::{Duration, Instant};
use datafusion::execution::context::ExecutionContext;
/// This example demonstrates executing a simple query against an Arrow data source (CSV) and
@ChengzhiZhao
ChengzhiZhao / Slickdeals_Analytics.py
Last active May 1, 2020 06:39
Slickdeals Analytics with Pandas and Plotly
import pandas as pd
import plotly.express as px
# Parse Data
url="https://raw.githubusercontent.com/ChengzhiZhao/jupyter-notebooks/master/slickdeals_data_file.csv"
df=pd.read_csv(url,header=None,names=['datetime','store','title','item_url','deal_price','original_price', 'like', 'comments'])
df['datetime'] = pd.to_datetime(df['datetime'])
# Data Analytics
# Frequency
@ChengzhiZhao
ChengzhiZhao / slick_deals_spider.py
Last active April 18, 2020 21:02
Slick Deals Crawler
import scrapy
import csv
from datetime import datetime
from scrapy.selector import Selector
class SlickDealsSpider(scrapy.Spider):
name = "slickdeals"
def start_requests(self):
@ChengzhiZhao
ChengzhiZhao / scheduler_101_dag.py
Created April 10, 2020 01:05
Airflow Scheduler Interval 101
from airflow.models import DAG
from datetime import datetime, timedelta
from airflow.operators.bash_operator import BashOperator
# Argument dictionary for the Airflow example.
# NOTE(review): presumably passed to the DAG as default_args — the usage is not visible here.
args = dict(
    owner='Airflow',
    start_date=datetime(2020, 4, 1),
    depends_on_past=True,
)
@ChengzhiZhao
ChengzhiZhao / Dask_Delayed.py
Last active April 4, 2020 01:08
Dask_Delayed
import dask
import dask.dataframe as dd
import time
from dask.distributed import Client, progress
client = Client(threads_per_worker=2, n_workers=2)
client
def call_api():
time.sleep(1)