# Cassandra Steps:
1) CREATE KEYSPACE test WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1};
2) USE test;
3) CREATE TABLE users (
       user_id UUID PRIMARY KEY,
       name TEXT,
       email TEXT,
       age INT
   );
4) INSERT INTO users (user_id, name, email, age)
   VALUES (uuid(), 'John Doe', '[email protected]', 25);
   INSERT INTO users (user_id, name, email, age)
   VALUES (uuid(), 'Jane Smith', '[email protected]', 30);
5) SELECT * FROM users;
6) UPDATE users SET age = 26 WHERE user_id = <user_id>;  -- replace <user_id> with a UUID returned by step 5
7) SELECT * FROM users;
8) DELETE FROM users WHERE user_id = 98cc5678-4744-47d2-88c8-69d3214061de;
9) SELECT * FROM users;
10) DROP TABLE users;
11) DROP KEYSPACE test;
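# Optional: the same steps can be scripted from Python. This is a minimal sketch,
# not part of the original steps, assuming a local Cassandra node on 127.0.0.1 and
# the DataStax driver (pip install cassandra-driver); the example name and email
# values are hypothetical.
from cassandra.cluster import Cluster

cluster = Cluster(["127.0.0.1"])  # assumed single-node local cluster
session = cluster.connect()
session.execute(
    "CREATE KEYSPACE IF NOT EXISTS test "
    "WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 1}"
)
session.set_keyspace("test")
session.execute(
    "CREATE TABLE IF NOT EXISTS users "
    "(user_id UUID PRIMARY KEY, name TEXT, email TEXT, age INT)"
)
# Positional parameters use %s for simple statements in this driver
session.execute(
    "INSERT INTO users (user_id, name, email, age) VALUES (uuid(), %s, %s, %s)",
    ("John Doe", "john@example.com", 25),  # hypothetical values, for illustration only
)
for row in session.execute("SELECT * FROM users"):
    print(row.user_id, row.name, row.email, row.age)
cluster.shutdown()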
#2.1) JSON to HORUS
import json

def json_to_horus(input_file, output_file):
    with open(input_file, "r") as f:
        data = json.load(f)

    def convert(obj, indent=0):
        result = ""
        for key, value in obj.items():
            if isinstance(value, dict):
                result += " " * indent + f"[{key}]\n" + convert(value, indent + 4)
            else:
                result += " " * indent + f"{key}: {value}\n"
        return result

    with open(output_file, "w") as f:
        f.write(convert(data))

# Example usage
json_to_horus("input.json", "output.horus")
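# For reference (not in the original), a hypothetical input.json such as
#   {"person": {"name": "Asha", "age": 30}, "active": true}
# would be written to output.horus by convert() as:
#   [person]
#       name: Asha
#       age: 30
#   active: True
# Nested dicts become bracketed section headers; leaf values become key: value lines.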
#2.2) Text Delimited CSV to HORUS format.
import csv

def csv_to_horus(input_file, output_file, delimiter=",", encoding="utf-8"):
    with open(input_file, "r", encoding=encoding) as f:
        reader = csv.reader(f, delimiter=delimiter)
        headers = next(reader)  # Read header row
        horus_output = ""
        for i, row in enumerate(reader, start=1):
            horus_output += f"[Record {i}]\n"
            for key, value in zip(headers, row):
                horus_output += f" {key}: {value}\n"
            horus_output += "\n"
    with open(output_file, "w") as f:
        f.write(horus_output)

# Example usage
csv_to_horus("Country_Code.csv", "output1.horus", delimiter=",", encoding="latin-1")
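# For reference (not in the original), a hypothetical Country_Code.csv such as
#   Country,Code
#   India,IN
# would produce:
#   [Record 1]
#    Country: India
#    Code: IN
# Each data row becomes a numbered record, with header names reused as keys.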
#3.1) Demonstrate Fixers Utilities
# Cleaning Data
import string
import datetime as dt

dataset = " Data Science with too many spaces is bad ."
print("Before Cleaning\n", dataset)
print("Clean Dataset\n", dataset.strip())

# Removing Non-Printable Characters
printable = set(string.printable)
print(printable)
bad_data = "Data\x00Science with\x02 funny characters is \x10bad!!!"
clean_data = ''.join(filter(lambda x: x in string.printable, bad_data))
print(bad_data)
print(clean_data)

# Reformatting a data entry to match specific formatting criteria.
baddate = dt.date(2019, 10, 31)
baddata = format(baddate, '%Y-%m-%d')
print(baddata)
gooddate = dt.datetime.strptime(baddata, '%Y-%m-%d')
gooddata = format(gooddate, '%d %B %Y')
print(gooddata)
#3.2) Demonstrate Averaging of Data.
import pandas as pd

inputfile = "sample_large_data.csv"
df = pd.read_csv(inputfile)
print(df)
meandata = df.groupby(['Gender'])['Salary'].mean()
print(meandata)
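# Note: sample_large_data.csv is not included here. A minimal sketch to generate a
# compatible file for testing, assuming only Gender and Salary columns are needed:
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
pd.DataFrame({
    "Gender": rng.choice(["Male", "Female"], size=1000),
    "Salary": rng.integers(30000, 120000, size=1000),
}).to_csv("sample_large_data.csv", index=False)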
#3.3) Data Binning or Bucketing.
import pandas as pd

raw_data = {
    "City": ["Sawantwadi", "Kudal", "Malvan", "Devgad", "Dodamarg", "Kankavli", "Vaibhavwadi", "Vengurla"],
    "Rank": ["1st", "2nd", "1st", "2nd", "1st", "2nd", "1st", "2nd"],
    "Score1": [48, 55, 45, 80, 60, 100, 90, 75],
    "Score2": [20, 60, 85, 96, 55, 65, 30, 96]
}
df = pd.DataFrame(
    raw_data,
    index=pd.Index(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H'], name='letter'),
    columns=pd.Index(['City', 'Rank', 'Score1', 'Score2'], name='Attributes')
)
print(df)
bins = [0, 25, 50, 75, 100]
group = ['low', 'average', 'good', 'brilliant']
df['grade'] = pd.cut(df['Score2'], bins, labels=group)
print(df)
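# Note (not in the original): pd.cut uses right-closed intervals by default, so with
# bins [0, 25, 50, 75, 100] a Score2 of 20 falls in (0, 25] -> 'low' and 96 falls in
# (75, 100] -> 'brilliant'. A value of exactly 0 would become NaN unless
# include_lowest=True is passed.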
#5.1) Write R program to create the network routing diagram from the given data on routers.
library(igraph)
router_d <- data.frame(
  Source = c("Router1", "Router1", "Router2", "Router2", "Router3", "Router4"),
  Destination = c("Router2", "Router3", "Router3", "Router4", "Router4", "Router5"),
  Cost = c(10, 15, 5, 20, 10, 25)
)
g <- graph_from_data_frame(router_d, directed = TRUE)
plot(
  g,
  edge.label = E(g)$Cost,
  vertex.size = 30,
  vertex.label.cex = 1.2,
  edge.arrow.size = 0.5,
  main = "Network Routing Diagram"
)
#5.2) Write an R program to build an acyclic graph.
library(igraph)
nodes <- c("A", "B", "C", "D", "E")
edges <- c("A", "B", "A", "C", "B", "D", "C", "E", "D", "E")
graph <- graph(edges = edges, directed = TRUE)
if (is_dag(graph)) {
  print("The graph is acyclic (DAG).")
} else {
  print("The graph contains cycles.")
}
plot(graph, vertex.color = "skyblue", vertex.size = 30, edge.arrow.size = 0.5, vertex.label.cex = 1.2, main = "Acyclic Graph")
#6) Build the time hub, links and satellites.
import pandas as pd
import sqlite3
from datetime import datetime, timedelta
import uuid

# Create a SQLite database (or connect to an existing one)
conn = sqlite3.connect("time_hub.db")

# Create Time Hub
def create_time_hub():
    time_data = []
    base_time = datetime(2023, 1, 1, 0, 0, 0)  # Starting point for time data
    for i in range(100):  # Generate 100 timestamps
        current_time = base_time + timedelta(minutes=i * 15)
        time_id = str(uuid.uuid4())
        time_data.append({"TimeID": time_id, "Timestamp": current_time})
    time_hub = pd.DataFrame(time_data)
    return time_hub

# Create Time Satellites
def create_time_satellites(time_hub):
    time_satellites = []
    for _, row in time_hub.iterrows():
        timestamp = row["Timestamp"]
        time_satellites.append({
            "TimeID": row["TimeID"],
            "Year": timestamp.year,
            "Month": timestamp.month,
            "Day": timestamp.day,
            "Hour": timestamp.hour,
            "Minute": timestamp.minute
        })
    time_satellite_df = pd.DataFrame(time_satellites)
    return time_satellite_df

# Create Time Links
def create_time_links(time_hub):
    time_zones = ["UTC", "EST", "PST", "CET"]
    time_links = []
    for _, row in time_hub.iterrows():
        timestamp = row["Timestamp"]
        for zone in time_zones:
            link_id = str(uuid.uuid4())
            time_links.append({
                "LinkID": link_id,
                "TimeID": row["TimeID"],
                "TimeZone": zone,
                "AdjustedTime": timestamp  # For simplicity, no real adjustment is made
            })
    time_link_df = pd.DataFrame(time_links)
    return time_link_df

# Generate Data
time_hub = create_time_hub()
time_satellites = create_time_satellites(time_hub)
time_links = create_time_links(time_hub)

# Save to SQLite
time_hub.to_sql("TimeHub", conn, if_exists="replace", index=False)
time_satellites.to_sql("TimeSatellites", conn, if_exists="replace", index=False)
time_links.to_sql("TimeLinks", conn, if_exists="replace", index=False)

# Display Results
print("Time Hub:")
print(time_hub.head())
print("\nTime Satellites:")
print(time_satellites.head())
print("\nTime Links:")
print(time_links.head())

# Close connection
conn.close()
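# Optional check (not part of the original): read one table back from SQLite to
# verify the load, assuming the same time_hub.db file written above.
import sqlite3
import pandas as pd

conn = sqlite3.connect("time_hub.db")
print(pd.read_sql("SELECT * FROM TimeHub LIMIT 5", conn))
conn.close()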
#7) Transforming data.
import pandas as pd

data = {
    "OrderID": [1, 2, 3, 4, 5],
    "Product": ["Laptop", "Mouse", "Keyboard", "Monitor", "Laptop"],
    "Quantity": [2, 5, 3, 1, 1],
    "Price": [1500, 20, 50, 200, 1500],
    "OrderDate": [
        "2023-01-01 10:15",
        "2023-01-01 12:30",
        "2023-01-02 09:00",
        "2023-01-03 14:45",
        "2023-01-03 16:00",
    ],
}
df = pd.DataFrame(data)

# 1. Convert OrderDate to datetime
df["OrderDate"] = pd.to_datetime(df["OrderDate"])

# 2. Add a Total column (Quantity * Price)
df["Total"] = df["Quantity"] * df["Price"]

# 3. Filter data (e.g., orders after 2023-01-02)
filter_df = df[df["OrderDate"] > "2023-01-02"]

# 4. Group by Product and calculate total revenue and quantity sold
grouped_df = df.groupby("Product").agg(
    TotalRevenue=pd.NamedAgg(column="Total", aggfunc="sum"),
    TotalQuantity=pd.NamedAgg(column="Quantity", aggfunc="sum")
)

print(df)
print(filter_df)
print(grouped_df)
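# Note (not in the original): since both aggregations are plain sums, an
# equivalent shorthand is
#   df.groupby("Product")[["Total", "Quantity"]].sum()
# NamedAgg is used above only to rename the result columns in one step.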
# Generating data.
#9.1) Random Numerical Data:
import numpy as np
import pandas as pd

np.random.seed(42)
data = {
    "ID": range(1, 101),
    "Age": np.random.randint(18, 60, size=100),
    "Salary": np.random.randint(30000, 120000, size=100),
}
df = pd.DataFrame(data)
print(df.head())

#9.2) Time-Series Data:
date_range = pd.date_range(start="2023-01-01", periods=100, freq="D")
time_series_data = {
    "Date": date_range,
    "Temperature": np.random.uniform(15, 35, size=100),
}
time_series_df = pd.DataFrame(time_series_data)
print(time_series_df)

#9.3) Categorical Data:
categories = ["Electronics", "Clothing", "Groceries"]
categorical_data = {
    "ID": range(1, 101),
    "Category": np.random.choice(categories, size=100),
}
categorical_df = pd.DataFrame(categorical_data)
print(categorical_df)
----------------------Practical 10------------------
#PowerBI
Steps:
1) Home --> Get Data --> click on OData feed.
2) In the popup window, enter the URL http://services.odata.org/V3/Northwind/Northwind.svc/ and click 'OK'.
3) In the Navigator window that opens, select 'Orders, Products, Order_Details' and then click the Transform Data button.
4) Click on Products --> keep only the ProductID, ProductName, QuantityPerUnit, and UnitsInStock columns.
5) Click on the UnitsInStock column and check its Data Type --> if it is not Whole Number, change it to Whole Number.
6) Click on the Orders data --> find the Order_Details column --> click the expand arrow on the Order_Details column --> a new window opens --> select only ProductID, UnitPrice, and Quantity and click the OK button.
7) In this step, keep only the OrderDate, ShipCity, ShipCountry, Order_Details.ProductID, Order_Details.UnitPrice, and Order_Details.Quantity columns.
   Rename Order_Details.ProductID, Order_Details.UnitPrice, and Order_Details.Quantity to ProductID, UnitPrice, and Quantity.
8) In the Add Column ribbon tab, click Add Custom Column --> enter [UnitPrice] * [Quantity] --> change the column name to LineTotal --> right-click the LineTotal column --> click the Change Type option and choose Decimal Number --> then drag the LineTotal column to the left, after ShipCountry.
9) In the Home ribbon, click the Close & Apply dropdown and then select Close & Apply.
10) Once the data is loaded, select the Manage Relationships button in the Modeling ribbon.
11) Select the Relationship view in Power BI Desktop to see the relationships --> then click on Report view.
12) Click on the clustered column chart and drag ShipCountry to the X-axis and Quantity to the Y-axis.
13) Click on the map and drag ShipCountry to the Legend box.