Last active
January 16, 2024 14:12
-
-
Save ottomata/ae2b8d5477f9991c15b21bdfed24fe09 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Set up a local Flink SQL client that can read from Kafka.
#
# 1. Collect the connector jars the SQL client needs in ~/flink-sql-libs.
# 2. Download and unpack the Flink 1.17.2 binary distribution.
# 3. Start the SQL client with the jar directory passed via -l.

# -p keeps this step re-runnable if the directory already exists.
cd ~/
mkdir -p flink-sql-libs
cd flink-sql-libs/
wget https://repo1.maven.org/maven2/org/apache/flink/flink-connector-kafka/1.17.2/flink-connector-kafka-1.17.2.jar
wget https://repo1.maven.org/maven2/org/apache/kafka/kafka-clients/3.4.0/kafka-clients-3.4.0.jar

# Only need this if querying WMF event streams.
# https://wikitech.wikimedia.org/wiki/Event_Platform/Stream_Processing/Flink_Catalog#Creating_Tables
# wget https://archiva.wikimedia.org/repository/releases/org/wikimedia/eventutilities-flink/1.3.3/eventutilities-flink-1.3.3-jar-with-dependencies.jar

# Download Flink.
# Use archive.apache.org: the dlcdn.apache.org mirror only carries currently
# supported releases, so older versions such as 1.17.2 disappear from it.
cd ~/
wget https://archive.apache.org/dist/flink/flink-1.17.2/flink-1.17.2-bin-scala_2.12.tgz
tar -xvzf flink-1.17.2-bin-scala_2.12.tgz
cd flink-1.17.2

# Launch the SQL client, pointing it at the directory of jars downloaded above.
./bin/sql-client.sh -l ~/flink-sql-libs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Declare a temporary Flink table over the raw `webrequest_text` Kafka topic
-- so it can be queried ad hoc from the SQL client.
USE CATALOG default_catalog;

-- NOTE: When we do T314956 - [Event Platform] Declare webrequest as an Event Platform stream
-- https://phabricator.wikimedia.org/T314956, we won't need to do this.
CREATE TEMPORARY TABLE `webrequest_tmp0` (
    `hostname`        STRING COMMENT 'Source node hostname',
    `sequence`        BIGINT COMMENT 'Per host sequence number',
    `dt`              STRING COMMENT 'Timestamp at cache in ISO 8601',
    `time_firstbyte`  DOUBLE COMMENT 'Time to first byte',
    `ip`              STRING COMMENT 'IP of packet at cache',
    `cache_status`    STRING COMMENT 'Cache status',
    `http_status`     STRING COMMENT 'HTTP status of response',
    `response_size`   BIGINT COMMENT 'Response size',
    `http_method`     STRING COMMENT 'HTTP method of request',
    `uri_host`        STRING COMMENT 'Host of request',
    `uri_path`        STRING COMMENT 'Path of request',
    `uri_query`       STRING COMMENT 'Query of request',
    `content_type`    STRING COMMENT 'Content-Type header of response',
    `referer`         STRING COMMENT 'Referer header of request',
    `user_agent`      STRING COMMENT 'User-Agent header of request',
    `accept_language` STRING COMMENT 'Accept-Language header of request',
    `x_analytics`     STRING COMMENT 'X-Analytics header of response',
    `range`           STRING COMMENT 'Range header of response',
    `x_cache`         STRING COMMENT 'X-Cache header of response',
    -- Taken from the Kafka connector's 'timestamp' metadata key, not from the JSON payload.
    -- NOTE(review): the Flink docs list this metadata as TIMESTAMP_LTZ(3); declaring it
    -- TIMESTAMP(3) relies on a cast -- confirm the time zone semantics are what you want.
    `ts` TIMESTAMP(3) METADATA FROM 'timestamp'
) WITH (
    'connector' = 'kafka',
    'topic' = 'webrequest_text',
    'properties.bootstrap.servers' = 'kafka-jumbo1007.eqiad.wmnet:9092',
    'properties.group.id' = 'otto-webrequest-text0',
    -- Start from the newest offsets: only messages produced after the query starts are read.
    'scan.startup.mode' = 'latest-offset',
    'format' = 'json',
    -- Skip records/fields that fail JSON parsing instead of failing the query.
    'json.ignore-parse-errors' = 'true'
);
Flink SQL> select uri_host from webrequest_tmp0 limit 100;
uri_host
en.wikipedia.org
en.wikipedia.org
en.wikipedia.org
www.wikipedia.org
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment