Let's import the page and categorylinks tables from the plwiki database.
Sqoop will need JAVA_HOME set. We (ops) should make sure it is set for all user shells. #TODO
export JAVA_HOME=/usr/lib/jvm/java-1.7.0-openjdk-amd64
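# For reference, a Sqoop 1 invocation along these lines would pull one of the
# tables into HDFS. This is a hedged sketch only: the MySQL host, credentials
# file, and HDFS target directory below are placeholder assumptions, not
# values from this document.
sqoop import \
  --connect jdbc:mysql://db-host.example/plwiki \
  --username research \
  --password-file /user/hdfs/mysql.password \
  --table categorylinks \
  --target-dir /user/hdfs/plwiki/categorylinks \
  --num-mappers 4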

"""
Example pyspark code integrating with the Wikimedia Event Platform to automate
getting a Spark Structured Streaming DataFrame using event streams and event JSONSchemas.
See also: https://wikitech.wikimedia.org/wiki/Event_Platform

You'll need the following jar dependencies:
- Kafka Client:
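"""

# The docstring's jar list is truncated above, so its closing quotes are
# supplied here. Below is a minimal sketch of what such integration code might
# look like; the broker, topic, and schema are illustrative assumptions, and
# real Event Platform code would derive the Spark schema from the event's
# JSONSchema rather than hand-writing it.
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType

spark = SparkSession.builder.appName("event-stream-demo").getOrCreate()

# Placeholder schema standing in for one generated from a JSONSchema.
schema = StructType([
    StructField("meta", StructType([StructField("dt", StringType())])),
    StructField("database", StringType()),
])

# Requires the Spark Kafka source jar on the classpath (see jar list above).
df = (spark.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "broker.example:9092")  # placeholder broker
      .option("subscribe", "eqiad.mediawiki.page-create")        # placeholder topic
      .load()
      .select(from_json(col("value").cast("string"), schema).alias("event"))
      .select("event.*"))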

/usr/bin/spark2-submit \
    --name otto_test_refine_eventlogging_0 \
    --class org.wikimedia.analytics.refinery.job.refine.Refine \
    --master yarn \
    --deploy-mode client \
    --conf spark.driver.extraClassPath=/usr/lib/hadoop-mapreduce/hadoop-mapreduce-client-common.jar:/srv/deployment/analytics/refinery/artifacts/hive-jdbc-1.1.0-cdh5.10.0.jar:/srv/deployment/analytics/refinery/artifacts/hive-service-1.1.0-cdh5.10.0.jar \
    --driver-java-options='-Drefine.log.level=DEBUG -Drefinery.log.level=DEBUG -Dhttp.proxyHost=webproxy.eqiad.wmnet -Dhttp.proxyPort=8080 -Dhttps.proxyHost=webproxy.eqiad.wmnet -Dhttps.proxyPort=8080' \
    /home/otto/refinery-source/refinery-job/target/refinery-job-0.0.89-SNAPSHOT.jar \
    --database=otto_json_refine_test \
    --hive_server_url=an-coord1001.eqiad.wmnet:10000 \

# Example of using an InitContainer in place of a GitRepo volume.
# Unlike GitRepo volumes, this approach runs the git command in a container,
# with the associated hardening.
apiVersion: v1
kind: Pod
metadata:
  name: git-repo-demo
  annotations:
    seccomp.security.alpha.kubernetes.io/pod: 'docker/default'
spec:
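  # The manifest is truncated above at `spec:`. The continuation below is a
  # hedged sketch of how it might proceed: image names, the repo URL, and the
  # mount paths are assumptions, not text from the original.
  initContainers:
    - name: git-clone
      image: alpine/git              # any image with git installed
      args:
        - clone
        - --single-branch
        - --
        - https://github.com/kubernetes/kubernetes   # example repo
        - /repo
      volumeMounts:
        - name: git-repo
          mountPath: /repo
  containers:
    - name: main
      image: busybox
      args: ['sleep', '3600']
      volumeMounts:
        - name: git-repo
          mountPath: /repo           # the cloned repo appears here
  volumes:
    - name: git-repo
      emptyDir: {}                   # the InitContainer clones into this volume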

// This will do the 'while(true)' trick for you.
// The callback should return a promise that is fulfilled when this batch's processing is done.
// However, I'm not 100% sure this is the API we want. I would rather see the Consumer inherit
// from EventEmitter and emit a `message` event for each consumed message and an `error` event
// for each error, the same as kafka-node does. That feels more natural to me, but it's a
// bigger discussion.
class Consumer {
    constructor(rawConsumer) {
        this.rawConsumer = rawConsumer;
        this.isStarted = false;
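    }

    // The class is truncated above; the sketch below is a hedged guess at the
    // API the comment describes, nothing more. `rawConsumer.consume()` is a
    // hypothetical method standing in for whatever the underlying client exposes.
    start(handleBatch) {
        this.isStarted = true;
        const loop = () => {
            if (!this.isStarted) {
                return Promise.resolve();
            }
            // Consume one batch, wait for the callback's promise, then repeat:
            // the promise-chained equivalent of the 'while(true)' trick.
            return this.rawConsumer.consume()
                .then(batch => handleBatch(batch))
                .then(loop);
        };
        return loop();
    }

    stop() {
        this.isStarted = false;
    }
}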

-- this is a collection of odd findings while trying to reconstruct mediawiki history
-- 1. revisions that should stand for a page creation without a corresponding record in the page table
-- REASON: move_redir orphans records in the revision table instead of archiving them like move_and_delete does
--         MovePage::moveToInternal $moveOverRedirect, $newpage->doDeleteUpdates line 503
--         as opposed to the correct: SpecialMovepage.php $page->doDeleteArticleReal line 559
select count(*)
from revision
where rev_parent_id = 0
  and rev_page not in (select page_id from page);

set start_day=16;
set end_day=17;
set table_name=varnish2;
select M.hostname,
       M.lower_seq,
       M.upper_seq,
       M.total_seqs,
       (M.expected_seqs - M.total_seqs) as missing_seqs,
       (M.expected_seqs - M.total_seqs) / M.expected_seqs as average_loss
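-- The query is truncated above; the FROM clause below is a hedged
-- reconstruction inferred from the aliases in the select list, not text from
-- the original. The column names (hostname, sequence, day) are assumptions.
from (
    select hostname,
           min(sequence)                     as lower_seq,
           max(sequence)                     as upper_seq,
           count(*)                          as total_seqs,
           max(sequence) - min(sequence) + 1 as expected_seqs
    from ${hiveconf:table_name}
    where day >= ${hiveconf:start_day}
      and day <  ${hiveconf:end_day}
    group by hostname
) M;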

/* Common setup, two variants follow */
use test;
set tablename=webrequest_esams0;
add jar /home/otto/hive-serdes-1.0-SNAPSHOT.jar;
add jar /usr/lib/hive/lib/hive-contrib-0.10.0-cdh4.3.1.jar;
create temporary function rowSequence AS 'org.apache.hadoop.hive.contrib.udf.UDFRowSequence';
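-- The two variants mentioned above are missing from this snippet. For
-- illustration only, a hypothetical query exercising the UDF just created
-- (the column names are assumptions about the webrequest table):
select rowSequence() as seq, dt, uri_host
from ${hiveconf:tablename}
limit 10;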

Java library                 | Debian package                        | Platform         | Exact version match
-----------------------------|---------------------------------------|------------------|--------------------
metrics-core-2.2.0.jar       | Could not find it                     |                  |
metrics-annotation-2.2.0.jar | Could not find it                     |                  |
zkclient-0.2.jar             | Seems to have been deprecated at V0.1 |                  |
jopt-simple-3.2.jar          | libjoptsimple-java 3.1-3              | precise/universe | No
scala-compiler.jar           | scala                                 |                  | Possibly
slf4j-api-1.7.2.jar          | libslf4j-java 1.6.4-1                 | precise/universe | No
snappy-java-1.0.4.1.jar      | libsnappy-java (1.0.4.1~dfsg-1)       |                  | Yes