Last active
September 16, 2023 11:03
-
-
Save Nandan-N/05009bb04a2b4fd36aa020a197a1218a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-3.3.6.jar \ | |
-mapper "$PWD/mapper.py" \ | |
-reducer "$PWD/reducer.py" \ | |
-input <path_to_input_in_hdfs> \ | |
-output <path_to_output_folder_in_hdfs> | |
# Local dry run of the pipeline; `sort` stands in for Hadoop's shuffle phase.
# The sort key is explicitly tab-delimited so that a name containing spaces is
# treated as one key, matching the tab-separated pairs the mapper emits.
cat sample_data.json | ./mapper.py | sort -t "$(printf '\t')" -k 1,1 | ./reducer.py
#!/usr/bin/env python | |
import sys | |
import json | |
# Local dry run of the pipeline; `sort` stands in for Hadoop's shuffle phase.
# The sort key is explicitly tab-delimited so that a name containing spaces is
# treated as one key, matching the tab-separated pairs the mapper emits.
cat sample_data.json | ./mapper.py | sort -t "$(printf '\t')" -k 1,1 | ./reducer.py
hadoop jar $HADOOP_HOME/share/hadoop/tools/lib/hadoop-streaming-3.3.6.jar \ | |
-mapper "$PWD/mapper.py" \ | |
-reducer "$PWD/reducer.py" \ | |
-input <path_to_input_in_hdfs> \ | |
-output <path_to_output_folder_in_hdfs> | |
def map_record(line):
    """Turn one JSON input line into a ``name<TAB>strike_rate`` pair.

    The line is expected to be a JSON object with ``name``, ``runs`` and
    ``balls`` entries. The local strike rate is ``runs / balls * 100``
    rounded to 3 decimal places, or ``0.0`` when ``balls`` is not positive.

    Returns:
        The tab-separated pair as a string, or ``None`` when the line cannot
        be used (invalid JSON, a missing key, or a non-numeric value).
    """
    try:
        record = json.loads(line)
        name = record["name"]
        runs = float(record["runs"])
        balls = float(record["balls"])
    except (ValueError, KeyError, TypeError):
        # ValueError: malformed JSON or a non-numeric string.
        # KeyError:   one of the required fields is absent.
        # TypeError:  a null value, or the JSON is not an object at all.
        # A single bad record must not abort the whole streaming task.
        return None

    # Guard the division: zero balls faced yields a strike rate of 0.
    strike_rate = round((runs / balls) * 100, 3) if balls > 0 else 0.0
    return f"{name}\t{strike_rate}"


def main(lines=None, out=None):
    """Map every input line and write the resulting pairs, one per line.

    Args:
        lines: Iterable of input lines; defaults to ``sys.stdin``.
        out: Writable text stream; defaults to ``sys.stdout``.
    """
    lines = sys.stdin if lines is None else lines
    out = sys.stdout if out is None else out
    for line in lines:
        pair = map_record(line)
        if pair is not None:
            print(pair, file=out)


if __name__ == "__main__":
    main()
#!/usr/bin/env python
"""Hadoop Streaming reducer: average the local strike rates per batsman.

Input lines are ``name<TAB>strike_rate`` pairs that arrive grouped by name
(the shuffle/sort step guarantees this). For every name one JSON object
``{"name": ..., "strike_rate": ...}`` is written, where ``strike_rate`` is
the mean of that name's local strike rates rounded to 3 decimal places.
"""
import json
import sys


def _format_result(name, rate_sum, rate_count):
    """Serialise one batsman's mean strike rate as a JSON object string."""
    mean_rate = round(rate_sum / rate_count, 3) if rate_count > 0 else 0.0
    # json.dumps escapes quotes/backslashes in the name, which a hand-built
    # f-string would not; ensure_ascii=False leaves non-ASCII names readable.
    return json.dumps({"name": name, "strike_rate": mean_rate}, ensure_ascii=False)


def reduce_lines(lines):
    """Yield one JSON result string per run of consecutive equal names.

    Args:
        lines: Iterable of ``name<TAB>strike_rate`` lines, sorted by name.

    Lines that do not split into exactly two tab-separated fields, or whose
    second field is not a number, are skipped.
    """
    current_name = None
    rate_sum = 0.0
    rate_count = 0

    for line in lines:
        try:
            name, raw_rate = line.strip().split("\t")
            rate = float(raw_rate)
        except ValueError:
            continue

        # A different key means the previous batsman's group is complete.
        if current_name is not None and name != current_name:
            yield _format_result(current_name, rate_sum, rate_count)
            rate_sum = 0.0
            rate_count = 0

        current_name = name
        # Each record contributes one local strike rate to the mean. (Adding
        # rate * 100 "runs" per 100 balls and rescaling by 100 again inflated
        # the emitted value by a factor of 100.)
        rate_sum += rate
        rate_count += 1

    # Flush the final group, which has no following key change to trigger it.
    if current_name is not None:
        yield _format_result(current_name, rate_sum, rate_count)


def main(lines=None, out=None):
    """Reduce all input lines and write one JSON result per output line.

    Args:
        lines: Iterable of input lines; defaults to ``sys.stdin``.
        out: Writable text stream; defaults to ``sys.stdout``.
    """
    lines = sys.stdin if lines is None else lines
    out = sys.stdout if out is None else out
    for result in reduce_lines(lines):
        print(result, file=out)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment