Last active
August 10, 2023 13:55
-
-
Save grobbie/c3ef7b1e294c3f18151f2f9f5b3e5106 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Rank Linux distributions by stability from Stack Overflow issue data.

Reads per-question vote counts from a CSV in S3, sums the vote counts per
distribution as a proxy for reported issues, and ranks distributions so
that rank 1 = fewest issues = most stable.
"""
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, rank
from pyspark.sql.functions import sum as sum_  # alias: avoid shadowing builtin sum
from pyspark.sql.window import Window

spark = SparkSession.builder \
    .appName("LinuxDistributionStabilityRanking") \
    .getOrCreate()

# S3A client tuning: fail fast (one attempt, short timeouts) rather than
# hanging on an unreachable endpoint.
spark.conf.set("fs.s3a.attempts.maximum", "1")
spark.conf.set("fs.s3a.connection.establish.timeout", "5000")
spark.conf.set("fs.s3a.connection.timeout", "10000")
spark.conf.set("spark.hadoop.fs.s3a.aws.credentials.provider",
               "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider")
spark.conf.set("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
spark.conf.set("spark.hadoop.fs.s3a.path.style.access", "true")
spark.conf.set("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
# SECURITY: never hard-code S3 credentials in source. Read them from the
# environment; the placeholder fallback only preserves the original
# redacted behavior when the variables are unset.
spark.conf.set("spark.hadoop.fs.s3a.access.key",
               os.environ.get("S3A_ACCESS_KEY", "<REDACTED>"))
spark.conf.set("spark.hadoop.fs.s3a.secret.key",
               os.environ.get("S3A_SECRET_KEY", "<REDACTED>"))
spark.conf.set("spark.hadoop.fs.s3a.endpoint",
               os.environ.get("S3A_ENDPOINT", "<REDACTED>"))

df = spark.read.csv("s3a://data/stack_overflow_data.csv",
                    header=True, inferSchema=True)

# Total issue votes per distribution (proxy for "number of problems reported").
distribution_issues = df.groupBy("distro").agg(
    sum_("vote_count").alias("total_issues")
)

# Normalized stability score in [0, 1]: fewer issues -> higher score.
# You can adjust the formula to suit your specific ranking criteria;
# here a simple linear scale against the worst-case count is used.
# `or 1` guards against an empty dataset (max is None) and all-zero counts.
max_issue_count = (
    distribution_issues.selectExpr("max(total_issues)").collect()[0][0] or 1
)
distribution_issues = distribution_issues.withColumn(
    "stability_score", 1 - col("total_issues") / max_issue_count
)

# BUG FIX: the original ordered by total_issues DESC, which handed rank 1
# to the distro with the MOST issues — the opposite of the stated intent
# ("lower issue count means higher stability"). Rank ascending so that
# rank 1 = fewest issues = most stable.
# NOTE: an un-partitioned window collapses the data to a single partition;
# acceptable here because the grouped result is small (one row per distro).
windowSpec = Window.orderBy(col("total_issues").asc())
ranked_distribution = distribution_issues.withColumn(
    "rank", rank().over(windowSpec)
)
ranked_distribution.show()

spark.stop()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment