Related Setup: https://gist.github.com/hofmannsven/6814278
Related Pro Tips: https://ochronus.com/git-tips-from-the-trenches/
Related Setup: https://gist.github.com/hofmannsven/6814278
Related Pro Tips: https://ochronus.com/git-tips-from-the-trenches/
| /** | |
| * Get the Stack Exchange data from https://archive.org/details/stackexchange | |
| * Data set used here: math.stackexchange.com | |
| **/ | |
| //Open the file. sc.textFile returns an RDD (Resilient Distributed Dataset) | |
| //of Strings, one element per line of the file. | |
| val postXML = sc.textFile("Posts.xml") | |
| //Count the lines. Note: Run twice and see the difference ;) |
| - Format: 7-Zip compressed (.7z) | |
| - Files: | |
| - **badges**.xml | |
| - UserId, e.g.: "420" | |
| - Name, e.g.: "Teacher" | |
| - Date, e.g.: "2008-09-15T08:55:03.923" | |
| - **comments**.xml | |
| - Id | |
| - PostId | |
| - Score |
| package org.aja.tej.tej.test.spark | |
| /** | |
| * Created by mageswaran on 9/8/15. | |
| */ | |
| import java.util.Random | |
| import org.apache.spark.{SparkConf, SparkContext} |
| package org.aja.tej.examples | |
| import java.io.File | |
| import org.aja.tej.utils.TejUtils | |
| import org.apache.spark.{SparkConf, SparkContext} | |
| /** |
| package org.aja.tej.examples.ml | |
| import org.aja.tej.utils.TejUtils | |
| import org.apache.spark.ml.classification.MultilayerPerceptronClassifier | |
| import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator | |
| import org.apache.spark.mllib.util.MLUtils | |
| import org.apache.spark.sql.SQLContext | |
| /** | |
| * Created by mageswaran on 25/9/15. |
| package org.aja.tej.tej.test.spark | |
| /** | |
| * Created by mageswaran on 9/8/15. | |
| */ | |
| import java.util.Random | |
| import org.apache.spark.{SparkConf, SparkContext} |
| //For any updates, check: https://github.com/Mageswaran1989/aja/blob/master/src/examples/scala/org/aja/tej/examples/streaming/twitter/TwitterWithNeo4j.scala | |
| package org.aja.tej.examples.streaming.twitter | |
| import com.google.gson.Gson | |
| import org.aja.tej.utils.{TejUtils, TejTwitterUtils} | |
| import org.anormcypher.{Cypher, Neo4jREST} | |
| import org.apache.spark.sql.{AnalysisException, Row, SQLContext} | |
| import org.apache.spark.streaming.twitter.TwitterUtils | |
| import org.apache.spark.streaming.{Seconds, StreamingContext} | |
| import play.api.libs.ws.ning |
| package org.aja.tej.examples.streaming.twitter | |
| import com.google.gson.Gson | |
| import org.aja.tej.utils.{TejUtils, TejTwitterUtils} | |
| import org.anormcypher.{Cypher, Neo4jREST} | |
| import org.apache.spark.sql.{AnalysisException, Row, SQLContext} | |
| import org.apache.spark.streaming.twitter.TwitterUtils | |
| import org.apache.spark.streaming.{Seconds, StreamingContext} | |
| import play.api.libs.ws.ning |
| import os | |
| import boto3 | |
| from collections import defaultdict | |
| import botocore | |
| def get_matching_s3_objects(bucket, | |
| aws_access_key_id, | |
| aws_secret_access_key, | |
| region_name, | |
| prefix='', |