@1ambda
1ambda / yarn-find-apps.sh
Last active March 2, 2019 08:16
#!/usr/bin/env bash
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"

function usage() {
  echo "Usage:
  ${0##*/} [-h][-n=APP][-r=SECONDS][-b=DATETIME][-s=STATES][-o=ORDER]

Options:"
}
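For illustration only, the same option surface could be sketched in Python with `argparse`; the flag names mirror the shell usage string above, but the long names and help texts are assumptions, not part of the original script.

```python
# Hypothetical Python counterpart of the yarn-find-apps.sh options,
# sketched with argparse; long option names and help texts are invented.
import argparse

def build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(prog="yarn-find-apps")
    parser.add_argument("-n", "--name", metavar="APP",
                        help="filter applications by name")
    parser.add_argument("-r", "--refresh", metavar="SECONDS", type=int,
                        help="refresh interval in seconds")
    parser.add_argument("-b", "--begin", metavar="DATETIME",
                        help="only show applications started after this time")
    parser.add_argument("-s", "--states", metavar="STATES",
                        help="comma-separated YARN application states")
    parser.add_argument("-o", "--order", metavar="ORDER",
                        help="sort order for the output")
    return parser

args = build_parser().parse_args(["-n", "my-app", "-r", "30"])
```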
@1ambda
1ambda / Dockerfile
Created October 26, 2019 00:28
Dockerfile for extending jupyter/docker-stacks' minimal notebook
FROM jupyter/minimal-notebook:1386e2046833
# -----------------------------------------------------------------------------
# --- Constants
# -----------------------------------------------------------------------------
USER $NB_USER
WORKDIR /home/$NB_USER
# Configures the Kubernetes Pod that the Airflow Worker will run on.
executorConfig = ExecutorBuilder(
image = "dask-py38",
resource = { memory: "80960Mi", cpu: "32" },
resourceCapacityType = "SPOT",
resourceNodeSelector = { "compute-type": "airflow-cpu-intensive", ... },
notebookCustomPackages = ["pandas==1.2.3", "pyarrow==3.0.0"],
notebookKernel = "python38",
...
)
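The `ExecutorBuilder` call above is pseudocode rather than a real API. Its shape can be sketched as a plain Python dict for clarity; the helper function and key names below are hypothetical and only mirror the pseudocode's fields.

```python
# Plain-dict sketch of the ExecutorBuilder pseudocode above.
# build_executor_config and its key names are illustrative, not a real API.
def build_executor_config(image, memory, cpu, capacity_type,
                          node_selector, packages, kernel):
    """Collect the per-task pod settings into a single dict."""
    return {
        "image": image,
        "resources": {"memory": memory, "cpu": cpu},
        "capacityType": capacity_type,
        "nodeSelector": node_selector,
        "notebookCustomPackages": packages,
        "notebookKernel": kernel,
    }

executor_config = build_executor_config(
    image="dask-py38",
    memory="80960Mi",
    cpu="32",
    capacity_type="SPOT",
    node_selector={"compute-type": "airflow-cpu-intensive"},
    packages=["pandas==1.2.3", "pyarrow==3.0.0"],
    kernel="python38",
)
```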
from pyspark.sql.functions import *
from pyspark.sql.types import *
# Download the CSV file into the current directory, then run the code below.
# The file has a `.csv` extension, but the actual delimiter is `\t` (tab).
# If you are following along on Databricks, change the path to "/FileStore/tables/marketing_campaign.csv".
df = spark.read.load("./marketing_campaign.csv",
                     format="csv",
                     sep="\t",
                     inferSchema=True,
                     header=True)
df.printSchema() # Prints the schema, i.e. the shape of the data.
root
|-- ID: integer (nullable = true)
|-- Year_Birth: integer (nullable = true)
|-- Education: string (nullable = true)
|-- Marital_Status: string (nullable = true)
|-- Income: integer (nullable = true)
|-- Kidhome: integer (nullable = true)
|-- Teenhome: integer (nullable = true)
df.count()    # Counts the loaded rows and prints the number.
df.show()     # Prints a portion of the data to the console.
df.toPandas() # A PySpark function that makes the data easy to inspect in Jupyter.
# Result of df.count()
2240
# Result of df.toPandas() (some rows and columns omitted)
ID Year_Birth Education Marital_Status Income Kidhome Teenhome Dt_Customer Recency MntWines ... NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5 AcceptedCmp1 AcceptedCmp2 Complain Z_CostContact Z_Revenue Response
0 5524 1957 Graduation Single 58138.0 0 0 04-09-2012 58 635 ... 7 0 0 0 0 0 0 3 11 1
1 2174 1954 Graduation Single 46344.0 1 1 08-03-2014 38 11 ... 5 0 0 0 0 0 0 3 11 0
2 4141 1965 Graduation Together 71613.0 0 0 21-08-2013 26 426 ... 4 0 0 0 0 0 0 3 11 0
3 6182 1984 Graduation Together 26646.0 1 0 10-02-2014 26 11 ... 6 0 0 0 0 0 0 3 11 0
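Since the file is tab-separated despite its `.csv` extension, it can also be sanity-checked locally without Spark using only Python's standard library. This is a side illustration, not part of the original gist; the inline `sample` data below stands in for the real file.

```python
# Local sanity check of tab-separated data using only the standard library.
# The sample data is inline; with the real file you would open its path instead.
import csv
import io

sample = (
    "ID\tYear_Birth\tEducation\n"
    "5524\t1957\tGraduation\n"
    "2174\t1954\tGraduation\n"
)

reader = csv.DictReader(io.StringIO(sample), delimiter="\t")
rows = list(reader)
row_count = len(rows)        # number of data rows, like df.count()
columns = reader.fieldnames  # header names, like df.columns
```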
# Selects columns and renames them.
# Equivalent to SQL's SELECT ID AS id, Year_Birth AS year_birth, ...
dfSelected = df.select(
col("ID").alias("id"),
col("Year_Birth").alias("year_birth"),
col("Education").alias("education"),
col("Kidhome").alias("count_kid"),
col("Teenhome").alias("count_teen"),
    col("Dt_Customer").alias("date_customer"),
)
# Result of dfSelected.count()
2240
# dfSelected.printSchema()
root
|-- id: integer (nullable = true)
|-- year_birth: integer (nullable = true)
|-- education: string (nullable = true)
|-- count_kid: integer (nullable = true)
|-- count_teen: integer (nullable = true)
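The `select` + `alias` step is essentially a column mapping: keep the listed columns, drop the rest, and assign new names. The same logic can be illustrated on plain dict rows without Spark; the code below is an analogy, not Spark itself.

```python
# Pure-Python analogy of the select + alias renaming above;
# unmapped columns (e.g. Income) are dropped, like unselected Spark columns.
RENAMES = {
    "ID": "id",
    "Year_Birth": "year_birth",
    "Education": "education",
    "Kidhome": "count_kid",
    "Teenhome": "count_teen",
    "Dt_Customer": "date_customer",
}

def select_and_rename(row: dict) -> dict:
    """Keep only the mapped columns and rename them."""
    return {new: row[old] for old, new in RENAMES.items()}

renamed = select_and_rename({
    "ID": 5524, "Year_Birth": 1957, "Education": "Graduation",
    "Kidhome": 0, "Teenhome": 0, "Dt_Customer": "04-09-2012",
    "Income": 58138.0,  # not in RENAMES, so it is dropped
})
```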
# Output of df.rdd.id (without parentheses, Python shows the bound method rather than the id)
<bound method RDD.id of MapPartitionsRDD[25] at javaToPython at NativeMethodAccessorImpl.java:0>
# Output of dfSelected.rdd.id (a different underlying RDD: select produced a new DataFrame)
<bound method RDD.id of MapPartitionsRDD[31] at javaToPython at NativeMethodAccessorImpl.java:0>
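The `<bound method RDD.id of ...>` output appears because the method was referenced without parentheses; calling it with `()` would return the actual integer id. A small pure-Python illustration of the same behavior, using a made-up `RDDLike` class:

```python
# Why "<bound method ...>" appears: referencing a method without () yields
# the bound method object; calling it returns the value. RDDLike is invented.
class RDDLike:
    def __init__(self, rdd_id: int):
        self._id = rdd_id

    def id(self) -> int:
        return self._id

rdd = RDDLike(25)
without_call = repr(rdd.id)  # the bound method object, not the id
with_call = rdd.id()         # the actual id value
```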