# Cassandra Steps:
1) CREATE KEYSPACE test WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1};
2) USE test;
3) CREATE TABLE users (
       user_id UUID PRIMARY KEY,
       name TEXT,
       email TEXT,
       age INT
   );
4) INSERT INTO users (user_id, name, email, age)
   VALUES (uuid(), 'John Doe', '[email protected]', 25);
   INSERT INTO users (user_id, name, email, age)
   VALUES (uuid(), 'Jane Smith', '[email protected]', 30);
5) SELECT * FROM users;
6) UPDATE users SET age = 26 WHERE user_id = <user_id>;  -- replace <user_id> with a UUID returned by step 5
7) SELECT * FROM users;
8) DELETE FROM users WHERE user_id = 98cc5678-4744-47d2-88c8-69d3214061de;
9) SELECT * FROM users;
10) DROP TABLE users;
11) DROP KEYSPACE test;
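# Optional: the same steps can be scripted from Python. This is a minimal sketch,
# not part of the original steps, assuming a local Cassandra node on 127.0.0.1 and
# the DataStax driver (pip install cassandra-driver); the example name and email
# values are hypothetical.
from cassandra.cluster import Cluster

cluster = Cluster(["127.0.0.1"])  # assumed single-node local cluster
session = cluster.connect()
session.execute(
    "CREATE KEYSPACE IF NOT EXISTS test "
    "WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1}"
)
session.set_keyspace("test")
session.execute(
    "CREATE TABLE IF NOT EXISTS users "
    "(user_id UUID PRIMARY KEY, name TEXT, email TEXT, age INT)"
)
# Positional parameters use %s for simple statements in this driver
session.execute(
    "INSERT INTO users (user_id, name, email, age) VALUES (uuid(), %s, %s, %s)",
    ("John Doe", "john@example.com", 25),  # hypothetical values, for illustration only
)
for row in session.execute("SELECT * FROM users"):
    print(row.user_id, row.name, row.email, row.age)
cluster.shutdown()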
#2.1) JSON to HORUS
import json

def json_to_horus(input_file, output_file):
    with open(input_file, "r") as f:
        data = json.load(f)

    def convert(obj, indent=0):
        result = ""
        for key, value in obj.items():
            if isinstance(value, dict):
                result += " " * indent + f"[{key}]\n" + convert(value, indent + 4)
            else:
                result += " " * indent + f"{key}: {value}\n"
        return result

    with open(output_file, "w") as f:
        f.write(convert(data))

# Example usage
json_to_horus("input.json", "output.horus")
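# For reference (not in the original), a hypothetical input.json such as
#   {"person": {"name": "Asha", "age": 30}, "active": true}
# would be written to output.horus by convert() as:
#   [person]
#       name: Asha
#       age: 30
#   active: True
# Nested dicts become bracketed section headers; leaf values become key: value lines.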
#2.2) Text Delimited CSV to HORUS format.
import csv

def csv_to_horus(input_file, output_file, delimiter=",", encoding="utf-8"):
    with open(input_file, "r", encoding=encoding) as f:
        reader = csv.reader(f, delimiter=delimiter)
        headers = next(reader)  # Read header row
        horus_output = ""
        for i, row in enumerate(reader, start=1):
            horus_output += f"[Record {i}]\n"
            for key, value in zip(headers, row):
                horus_output += f" {key}: {value}\n"
            horus_output += "\n"
    with open(output_file, "w") as f:
        f.write(horus_output)

# Example usage
csv_to_horus("Country_Code.csv", "output1.horus", delimiter=",", encoding="latin-1")
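# For reference (not in the original), a hypothetical Country_Code.csv such as
#   Country,Code
#   India,IN
# would produce:
#   [Record 1]
#    Country: India
#    Code: IN
# Each data row becomes a numbered record, with header names reused as keys.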
#3.1) Demonstrate Fixers Utilities
# Cleaning Data
import string
import datetime as dt

dataset = " Data Science with too many spaces is bad ."
print("Before Cleaning\n", dataset)
print("Clean Dataset\n", dataset.strip())

# Removing Non-Printable Characters
printable = set(string.printable)
print(printable)
bad_data = "Data\x00Science with\x02 funny characters is \x10bad!!!"
clean_data = ''.join(filter(lambda x: x in string.printable, bad_data))
print(bad_data)
print(clean_data)

# Reformatting a data entry to match specific formatting criteria.
baddate = dt.date(2019, 10, 31)
baddata = format(baddate, '%Y-%m-%d')
print(baddata)
gooddate = dt.datetime.strptime(baddata, '%Y-%m-%d')
gooddata = format(gooddate, '%d %B %Y')
print(gooddata)
#3.2) Demonstrate Averaging of Data.
import pandas as pd

inputfile = "sample_large_data.csv"
df = pd.read_csv(inputfile)
print(df)
meandata = df.groupby(['Gender'])['Salary'].mean()
print(meandata)
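# Note: sample_large_data.csv is not included here. A minimal sketch to generate a
# compatible file for testing, assuming only Gender and Salary columns are needed:
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
pd.DataFrame({
    "Gender": rng.choice(["Male", "Female"], size=1000),
    "Salary": rng.integers(30000, 120000, size=1000),
}).to_csv("sample_large_data.csv", index=False)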
#3.3) Data Binning or Bucketing.
import pandas as pd

raw_data = {
    "City": ["Sawantwadi", "Kudal", "Malvan", "Devgad", "Dodamarg", "Kankavli", "Vaibhavwadi", "Vengurla"],
    "Rank": ["1st", "2nd", "1st", "2nd", "1st", "2nd", "1st", "2nd"],
    "Score1": [48, 55, 45, 80, 60, 100, 90, 75],
    "Score2": [20, 60, 85, 96, 55, 65, 30, 96]
}
df = pd.DataFrame(
    raw_data,
    index=pd.Index(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'], name='letter'),
    columns=pd.Index(['City', 'Rank', 'Score1', 'Score2'], name='Attributes')
)
print(df)
bins = [0, 25, 50, 75, 100]
group = ['low', 'average', 'good', 'brilliant']
df['grade'] = pd.cut(df['Score2'], bins, labels=group)
print(df)
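# Note (not in the original): pd.cut uses right-closed intervals by default, so with
# bins [0, 25, 50, 75, 100] a Score2 of 20 falls in (0, 25] -> 'low' and 96 falls in
# (75, 100] -> 'brilliant'. A value of exactly 0 would become NaN unless
# include_lowest=True is passed.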
#5.1) Write R program to create the network routing diagram from the given data on routers.
library(igraph)
router_d <- data.frame(
  Source = c("Router1", "Router1", "Router2", "Router2", "Router3", "Router4"),
  Destination = c("Router2", "Router3", "Router3", "Router4", "Router4", "Router5"),
  Cost = c(10, 15, 5, 20, 10, 25)
)
g <- graph_from_data_frame(router_d, directed = TRUE)
plot(
  g,
  edge.label = E(g)$Cost,
  vertex.size = 30,
  vertex.label.cex = 1.2,
  edge.arrow.size = 0.5,
  main = "Network Routing Diagram"
)
#5.2) Write an R program to build an acyclic graph.
library(igraph)
nodes <- c("A", "B", "C", "D", "E")
edges <- c("A", "B", "A", "C", "B", "D", "C", "E", "D", "E")
graph <- graph(edges = edges, directed = TRUE)
if (is_dag(graph)) {
  print("The graph is acyclic (DAG).")
} else {
  print("The graph contains cycles.")
}
plot(graph, vertex.color = "skyblue", vertex.size = 30, edge.arrow.size = 0.5, vertex.label.cex = 1.2, main = "Acyclic Graph")
#6) Build the time hub, links and satellites.
import pandas as pd
import sqlite3
from datetime import datetime, timedelta
import uuid

# Create a SQLite database (or connect to an existing one)
conn = sqlite3.connect("time_hub.db")

# Create Time Hub
def create_time_hub():
    time_data = []
    base_time = datetime(2023, 1, 1, 0, 0, 0)  # Starting point for time data
    for i in range(100):  # Generate 100 timestamps
        current_time = base_time + timedelta(minutes=i * 15)
        time_id = str(uuid.uuid4())
        time_data.append({"TimeID": time_id, "Timestamp": current_time})
    time_hub = pd.DataFrame(time_data)
    return time_hub

# Create Time Satellites
def create_time_satellites(time_hub):
    time_satellites = []
    for _, row in time_hub.iterrows():
        timestamp = row["Timestamp"]
        time_satellites.append({
            "TimeID": row["TimeID"],
            "Year": timestamp.year,
            "Month": timestamp.month,
            "Day": timestamp.day,
            "Hour": timestamp.hour,
            "Minute": timestamp.minute
        })
    time_satellite_df = pd.DataFrame(time_satellites)
    return time_satellite_df

# Create Time Links
def create_time_links(time_hub):
    time_zones = ["UTC", "EST", "PST", "CET"]
    time_links = []
    for _, row in time_hub.iterrows():
        timestamp = row["Timestamp"]
        for zone in time_zones:
            link_id = str(uuid.uuid4())
            time_links.append({
                "LinkID": link_id,
                "TimeID": row["TimeID"],
                "TimeZone": zone,
                "AdjustedTime": timestamp  # For simplicity, no real adjustment is made
            })
    time_link_df = pd.DataFrame(time_links)
    return time_link_df

# Generate Data
time_hub = create_time_hub()
time_satellites = create_time_satellites(time_hub)
time_links = create_time_links(time_hub)

# Save to SQLite
time_hub.to_sql("TimeHub", conn, if_exists="replace", index=False)
time_satellites.to_sql("TimeSatellites", conn, if_exists="replace", index=False)
time_links.to_sql("TimeLinks", conn, if_exists="replace", index=False)

# Display Results
print("Time Hub:")
print(time_hub.head())
print("\nTime Satellites:")
print(time_satellites.head())
print("\nTime Links:")
print(time_links.head())

# Close connection
conn.close()
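# Optional check (not part of the original): read one table back from SQLite to
# verify the load, assuming the same time_hub.db file written above.
import sqlite3
import pandas as pd

conn = sqlite3.connect("time_hub.db")
print(pd.read_sql("SELECT * FROM TimeHub LIMIT 5", conn))
conn.close()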
#7) Transforming data.
import pandas as pd

data = {
    "OrderID": [1, 2, 3, 4, 5],
    "Product": ["Laptop", "Mouse", "Keyboard", "Monitor", "Laptop"],
    "Quantity": [2, 5, 3, 1, 1],
    "Price": [1500, 20, 50, 200, 1500],
    "OrderDate": [
        "2023-01-01 10:15",
        "2023-01-01 12:30",
        "2023-01-02 09:00",
        "2023-01-03 14:45",
        "2023-01-03 16:00",
    ],
}
df = pd.DataFrame(data)

# 1. Convert OrderDate to datetime
df["OrderDate"] = pd.to_datetime(df["OrderDate"])

# 2. Add a Total column (Quantity * Price)
df["Total"] = df["Quantity"] * df["Price"]

# 3. Filter data (e.g., orders after 2023-01-02)
filter_df = df[df["OrderDate"] > "2023-01-02"]

# 4. Group by Product and calculate total revenue and quantity sold
grouped_df = df.groupby("Product").agg(
    TotalRevenue=pd.NamedAgg(column="Total", aggfunc="sum"),
    TotalQuantity=pd.NamedAgg(column="Quantity", aggfunc="sum")
)

print(df)
print(filter_df)
print(grouped_df)
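# Note (not in the original): since both aggregations are plain sums, an
# equivalent shorthand is
#   df.groupby("Product")[["Total", "Quantity"]].sum()
# NamedAgg is used above only to rename the result columns in one step.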
# Generating data.
#9.1) Random Numerical Data:
import numpy as np
import pandas as pd

np.random.seed(42)
data = {
    "ID": range(1, 101),
    "Age": np.random.randint(18, 60, size=100),
    "Salary": np.random.randint(30000, 120000, size=100),
}
df = pd.DataFrame(data)
print(df.head())

#9.2) Time-Series Data:
date_range = pd.date_range(start="2023-01-01", periods=100, freq="D")
time_series_data = {
    "Date": date_range,
    "Temperature": np.random.uniform(15, 35, size=100),
}
time_series_df = pd.DataFrame(time_series_data)
print(time_series_df)

#9.3) Categorical Data:
categories = ["Electronics", "Clothing", "Groceries"]
categorical_data = {
    "ID": range(1, 101),
    "Category": np.random.choice(categories, size=100),
}
categorical_df = pd.DataFrame(categorical_data)
print(categorical_df)
----------------------Practical 10------------------
#PowerBI
Steps:
1) Home --> Get Data --> click on OData feed.
2) In the popup window, enter the URL http://services.odata.org/V3/Northwind/Northwind.svc/ and click 'OK'.
3) In the Navigator window that opens, select 'Orders, Products, Order_Details' and then click the Transform Data button.
4) Click on Products --> keep only the ProductID, ProductName, QuantityPerUnit, and UnitsInStock columns.
5) Click on the UnitsInStock column and check its Data Type --> if it is not Whole Number, change it to Whole Number.
6) Click on the Orders data --> find the Order_Details column --> click the expand arrow on the Order_Details column --> a new window opens --> select only ProductID, UnitPrice, and Quantity and click the OK button.
7) In this step, keep only the OrderDate, ShipCity, ShipCountry, Order_Details.ProductID, Order_Details.UnitPrice, and Order_Details.Quantity columns.
   Rename Order_Details.ProductID, Order_Details.UnitPrice, and Order_Details.Quantity to ProductID, UnitPrice, and Quantity.
8) In the Add Column ribbon tab, click Add Custom Column --> enter [UnitPrice] * [Quantity] --> change the column name to LineTotal --> right-click the LineTotal column --> click the Change Type option and choose Decimal Number --> then drag the LineTotal column to the left, after ShipCountry.
9) In the Home ribbon, click the Close & Apply dropdown and then select Close & Apply.
10) Once the data is loaded, select the Manage Relationships button in the Modeling ribbon.
11) Select the Relationship view in Power BI Desktop to see the relationships --> then click on Report view.
12) Click on the clustered column chart and drag ShipCountry to the X-axis and Quantity to the Y-axis.
13) Click on the map and drag ShipCountry to the Legend box.