@grobbie
Last active August 10, 2023 13:55
from pyspark.sql import SparkSession
# Configure the S3A connector on the builder: keys passed with the
# "spark.hadoop." prefix at session-creation time are copied into the Hadoop
# configuration that the S3A filesystem actually reads. Setting them via
# spark.conf.set() after the session exists is unreliable, and mixing prefixed
# and unprefixed keys means some settings may silently never take effect.
spark = SparkSession.builder \
    .appName("LinuxDistributionStabilityRanking") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider") \
    .config("spark.hadoop.fs.s3a.access.key", "<REDACTED>") \
    .config("spark.hadoop.fs.s3a.secret.key", "<REDACTED>") \
    .config("spark.hadoop.fs.s3a.endpoint", "<REDACTED>") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false") \
    .config("spark.hadoop.fs.s3a.attempts.maximum", "1") \
    .config("spark.hadoop.fs.s3a.connection.establish.timeout", "5000") \
    .config("spark.hadoop.fs.s3a.connection.timeout", "10000") \
    .getOrCreate()
# Load the Stack Overflow export, reading the header row and inferring types.
df = spark.read.csv("s3a://data/stack_overflow_data.csv", header=True, inferSchema=True)
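# Sanity-check the schema before aggregating: everything below relies on the
# "distro" and "vote_count" columns being present in the CSV.
missing = {"distro", "vote_count"} - set(df.columns)
assert not missing, f"input CSV is missing expected columns: {missing}"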
# Import the Spark aggregate under an alias so it does not shadow the built-in sum().
from pyspark.sql.functions import sum as sum_

# Total issue signal per distribution: summed Stack Overflow vote counts.
distribution_issues = df.groupBy("distro").agg(sum_("vote_count").alias("total_issues"))
# Rank stability with a simple rule: a lower total issue count means a more
# stable distribution. Adjust the formula below to suit your own ranking criteria.
max_issue_count = distribution_issues.selectExpr("max(total_issues)").collect()[0][0]
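# A minimal sketch of the normalisation hinted at above, using max_issue_count:
# scale each distro's issue count against the maximum so that 1.0 is the most
# stable and 0.0 the least. The "stability_score" column name is an
# illustrative choice, not part of the original pipeline.
from pyspark.sql.functions import col
distribution_issues = distribution_issues.withColumn(
    "stability_score", 1.0 - col("total_issues") / float(max_issue_count)
)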
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col

# Order ascending so that rank 1 goes to the distribution with the fewest
# issues, matching the "lower issue count means higher stability" criterion.
windowSpec = Window.orderBy(col("total_issues").asc())
ranked_distribution = distribution_issues.withColumn("rank", rank().over(windowSpec))
ranked_distribution.show()
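# Optional: persist the ranking back to object storage. The output path below
# is illustrative only; coalesce(1) writes a single CSV part, which is fine for
# a result this small.
ranked_distribution.coalesce(1).write.mode("overwrite").csv(
    "s3a://data/distro_stability_ranking", header=True
)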
spark.stop()