Skip to content

Instantly share code, notes, and snippets.

View kristoff-it's full-sized avatar
in your codebase, fixing your bugz

Loris Cro kristoff-it

in your codebase, fixing your bugz
View GitHub Profile
>>> no_pol = result.where(col("occupation") != "POLITICIAN")
>>> no_pol.write.format("org.apache.spark.sql.redis").option("table", "occupation").option("key.column", "countryCode").save()
>>> result.where(col("occupation") == "POLITICIAN").count()
150
>>> from pyspark.sql.window import Window
>>> from pyspark.sql.functions import count, col, row_number
>>> w = Window().partitionBy("countryCode").orderBy(col("count(en_curid)").desc())
>>> result = counts.withColumn("rn", row_number().over(w)).where(col("rn") == 1).select("countryCode", "occupation")
>>> result.show(5)
+-----------+-------------+
|countryCode| occupation|
+-----------+-------------+
| DZ| POLITICIAN|
| LT| POLITICIAN|
>>> counts = df.groupby("countryCode", "occupation").agg({"en_curid": "count"})
>>> counts.show(2)
+-----------+-------------+---------------+
|countryCode| occupation|count(en_curid)|
+-----------+-------------+---------------+
| FR|MATHEMATICIAN| 34|
| IT|SOCCER PLAYER| 81|
+-----------+-------------+---------------+
only showing top 2 rows
>>> df = spark.read.format("org.apache.spark.sql.redis").option("table", "people").option("key.column", "en_curid").load()
>>> df.show(2)
+--------+-----------+----------+
|en_curid|countryCode|occupation|
+--------+-----------+----------+
| 915950| ZW| SWIMMER|
| 726159| UY|POLITICIAN|
+--------+-----------+----------+
only showing top 2 rows
> HGETALL people:2113653
1) "countryCode"
2) "DE"
3) "occupation"
4) "SOCCER PLAYER"
$ redis-cli
> SCAN 0 MATCH people:* COUNT 3
1) "2048"
2) 1) "people:2113653"
2) "people:44849"
3) "people:399280"
4) "people:101393"
>>> data.write.format("org.apache.spark.sql.redis").option("table", "people").option("key.column", "en_curid").save()
>>> data = full_df.select("en_curid", "countryCode", "occupation")
>>> data.show(2)
+--------+-----------+-----------+
|en_curid|countryCode| occupation|
+--------+-----------+-----------+
| 307| US| POLITICIAN|
| 308| GR|PHILOSOPHER|
+--------+-----------+-----------+
only showing top 2 rows
>>> full_df = spark.read.csv("pantheon.tsv", sep="\t", quote="", header=True, inferSchema=True)
>>> full_df.dtypes
[('en_curid', 'int'), ('name', 'string'), ('numlangs', 'int'), ('birthcity', 'string'), ('birthstate', 'string'), ('countryName', 'string'), ('countryCode', 'string'), ('countryCode3', 'string'), ('LAT', 'double'), ('LON', 'double'), ('continentName', 'string'), ('birthyear', 'string'), ('gender', 'string'), ('occupation', 'string'), ('industry', 'string'), ('domain', 'string'), ('TotalPageViews', 'int'), ('L_star', 'double'), ('StdDevPageViews', 'double'), ('PageViewsEnglish', 'int'), ('PageViewsNonEnglish', 'int'), ('AverageViews', 'double'), ('HPI', 'double')]