Screen Recording: https://vimeo.com/942403540?share=copy
from fsspec_gitlab import *
import fsspec
# Instantiate a gitlab repo filesystem
gitlab_repo = fsspec.filesystem("gitlab", base_uri="https://gitlab.wikimedia.org", project_path="repos/data-engineering/workflow_utils")
# list a directory
gitlab_repo.ls("workflow_utils", detail=True)
# open and read a file
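The snippet stops at the open-and-read step; a minimal sketch of how that step could look with the standard fsspec filesystem API (the file path is a hypothetical example, not taken from the repo):

# Hypothetical path, for illustration only; substitute a file that exists in the repo.
with gitlab_repo.open("workflow_utils/setup.py", "rb") as f:
    print(f.read().decode("utf-8"))
# Or read the whole file in one call:
print(gitlab_repo.cat("workflow_utils/setup.py").decode("utf-8"))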
cd ~/
mkdir flink-sql-libs
cd flink-sql-libs/
wget https://repo1.maven.org/maven2/org/apache/flink/flink-connector-kafka/1.17.2/flink-connector-kafka-1.17.2.jar
wget https://repo1.maven.org/maven2/org/apache/kafka/kafka-clients/3.4.0/kafka-clients-3.4.0.jar
# Only need this if querying WMF event streams.
# https://wikitech.wikimedia.org/wiki/Event_Platform/Stream_Processing/Flink_Catalog#Creating_Tables
# wget https://archiva.wikimedia.org/repository/releases/org/wikimedia/eventutilities-flink/1.3.3/eventutilities-flink-1.3.3-jar-with-dependencies.jar
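These jars can be handed to the SQL client with -j (as in the sql-client.sh invocation further down), or loaded from a PyFlink session via the pipeline.jars option; a sketch assuming the files were downloaded into ~/flink-sql-libs as above:

from pathlib import Path
from pyflink.table import EnvironmentSettings, TableEnvironment

# Build file:// URLs for the jars downloaded above (paths assume the wget layout above).
libs = Path.home() / "flink-sql-libs"
jars = ";".join(f"file://{p}" for p in [
    libs / "flink-connector-kafka-1.17.2.jar",
    libs / "kafka-clients-3.4.0.jar",
])

t_env = TableEnvironment.create(EnvironmentSettings.in_streaming_mode())
# pipeline.jars makes the Kafka connector classes available to the job.
t_env.get_config().set("pipeline.jars", jars)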
spark3-submit --class org.wikimedia.analytics.refinery.job.refine.tool.EvolveHiveTable ./refinery-job/target/refinery-job-0.2.28-SNAPSHOT-shaded.jar --table=event.mediawiki_page_change_v1 --schema_uri=/mediawiki/page/change/latest --dry_run=true
24/01/02 21:49:53 INFO DataFrameToHive: Found difference in schemas for Hive table otto.mw_page_change0
Table schema:
root
 |-- _schema: string (nullable = true)
 |-- changelog_kind: string (nullable = true)
 |-- comment: string (nullable = true)
 |-- created_redirect_page: struct (nullable = true)
 |    |-- is_redirect: boolean (nullable = true)
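Presumably the same command without --dry_run=true applies the evolution; a quick way to check the resulting Hive schema afterwards from PySpark (table name taken from the command above):

from pyspark.sql import SparkSession

spark = SparkSession.builder.enableHiveSupport().getOrCreate()
# Inspect the (possibly evolved) Hive table schema, including nested struct columns.
spark.sql("DESCRIBE TABLE event.mediawiki_page_change_v1").show(truncate=False)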
import argparse
import logging
import sys
from pyflink.common import WatermarkStrategy, Encoder, Types
from pyflink.datastream import StreamExecutionEnvironment, RuntimeExecutionMode, ProcessFunction, OutputTag
from pyflink.datastream.connectors.file_system import FileSource, StreamFormat, FileSink, OutputFileConfig, RollingPolicy
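Only the imports of the demo script are shown; presumably the rest follows the standard PyFlink file-source/file-sink pattern those imports point to. A minimal sketch with placeholder paths (not the actual demo body):

def run(input_path: str, output_path: str) -> None:
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_runtime_mode(RuntimeExecutionMode.BATCH)

    # Read the input files line by line.
    source = FileSource.for_record_stream_format(
        StreamFormat.text_line_format(), input_path
    ).build()
    ds = env.from_source(source, WatermarkStrategy.for_monotonous_timestamps(), "file_source")

    # Write lines back out as text files.
    sink = FileSink.for_row_format(output_path, Encoder.simple_string_encoder()) \
        .with_output_file_config(OutputFileConfig.builder().with_part_suffix(".txt").build()) \
        .with_rolling_policy(RollingPolicy.default_rolling_policy()) \
        .build()
    ds.sink_to(sink)

    env.execute("python_demo")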
FROM docker-registry.wikimedia.org/flink:1.16.0-37
# add python script
USER root
RUN mkdir -p /srv/flink_app && ls
ADD python_demo.py /srv/flink_app/python_demo.py
USER flink
# flake8: noqa
# This file is autogenerated by /metadata-ingestion/scripts/avro_codegen.py
# Do not modify manually!
# pylint: skip-file
# fmt: off
# The SchemaFromJSONData method only exists in avro-python3, but is called make_avsc_object in avro.
# We can use this fact to detect conflicts between the two packages. Pip won't detect those conflicts
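The comment describes the conflict-detection trick without showing it; a minimal sketch of how such a check could look, relying only on the two attribute names the comment mentions (the error message is made up here):

import avro.schema

# avro-python3 exposes SchemaFromJSONData, while the avro package calls it make_avsc_object.
# Both packages share the avro module name, so check which symbol is actually present.
if hasattr(avro.schema, "SchemaFromJSONData") and not hasattr(avro.schema, "make_avsc_object"):
    raise RuntimeError(
        "avro-python3 appears to shadow the avro package; uninstall avro-python3 to avoid the conflict."
    )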
# Need to download flink-connector-kafka-1.15.2.jar and kafka-clients-2.4.1.jar
./bin/sql-client.sh -i flink_sql_init.sql -pyfs get_revision_content_udf.py -pyexec /home/otto/pyflink_udf2/bin/python3 -pyclientexec /home/otto/pyflink_udf2/bin/python3 -j /home/otto/flink-connector-kafka-1.15.2.jar -j /home/otto/kafka-clients-2.4.1.jar
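get_revision_content_udf.py itself is not included here; a minimal sketch of the shape such a PyFlink scalar UDF module generally takes (function name, signature, and body are assumptions, not the actual implementation):

from pyflink.table import DataTypes
from pyflink.table.udf import udf

# Hypothetical scalar UDF; the real get_revision_content_udf.py presumably fetches
# revision content from MediaWiki, which is not reproduced here.
@udf(result_type=DataTypes.STRING())
def get_revision_content(wiki_domain: str, rev_id: int) -> str:
    # Placeholder body, for illustration only.
    return f"content of revision {rev_id} from {wiki_domain}"

Loaded via -pyfs, the function is presumably then registered (e.g. in flink_sql_init.sql) with something like CREATE TEMPORARY FUNCTION get_revision_content AS 'get_revision_content_udf.get_revision_content' LANGUAGE PYTHON.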
CREATE TEMPORARY TABLE mediawiki_page_change (
  `wiki_id` STRING,
  `meta` ROW<domain STRING>,
  `page_change_kind` STRING,
  `page` ROW<page_id BIGINT, page_title STRING>,
  `revision` ROW<rev_id BIGINT, content_slots MAP<STRING, ROW<slot_role STRING, content_format STRING, content_body STRING>>>
) WITH (
  'connector' = 'kafka',
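The WITH clause is cut off here (topic, brokers, and format are not shown). Once the table is fully defined, querying it from PyFlink might look like the sketch below; the selected columns come from the schema above:

# Assumes t_env is a TableEnvironment in which the mediawiki_page_change table above
# has been created with its complete WITH clause (topic, brokers, format, ...).
result = t_env.sql_query("""
    SELECT wiki_id, page.page_title, page_change_kind, revision.rev_id
    FROM mediawiki_page_change
""")
result.execute().print()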