Skip to content

Instantly share code, notes, and snippets.

@MU2324
Last active February 2, 2025 19:02
Show Gist options
  • Save MU2324/d820ae51c04bad1b66539bb760b50ada to your computer and use it in GitHub Desktop.
Save MU2324/d820ae51c04bad1b66539bb760b50ada to your computer and use it in GitHub Desktop.
# Cassandra Steps:-
1) CREATE KEYSPACE test WITH replication = {'class':'SimpleStrategy','replication_factor':1};
2) USE test;
3) CREATE TABLE users (
... user_id UUID PRIMARY KEY,
... name TEXT,
... email TEXT,
... age INT
... );
4) INSERT INTO users(user_id, name, email, age)
... VALUES (uuid(), 'John Doe', '[email protected]',25);
INSERT INTO users(user_id, name, email, age)
... VALUES (uuid(), 'Jane Smith', '[email protected]',30);
5) SELECT * FROM users;
6) UPDATE users SET age = 26 WHERE user_id = <user_id>;
7) SELECT * FROM users;
8) DELETE FROM users WHERE user_id = 98cc5678-4744-47d2-88c8-69d3214061de;
9) SELECT * FROM users;
10) DROP TABLE users;
11) DROP KEYSPACE test;
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
#2.1) JSON TO HORUS
import json


def json_to_horus(input_file, output_file):
    """Convert a JSON file into an indented HORUS-style text file.

    Nested objects become ``[key]`` section headers, with their contents
    indented four extra spaces per nesting level; every other value is
    written as a ``key: value`` line.

    input_file:  path of the JSON document to read (must contain an object).
    output_file: path the HORUS text is written to (overwritten if present).
    """
    with open(input_file, "r") as f:
        data = json.load(f)

    def convert(obj, indent=0):
        # Recursively render a dict; scalars (and lists) are written as-is.
        result = ""
        for key, value in obj.items():
            if isinstance(value, dict):
                result += " " * indent + f"[{key}]\n" + convert(value, indent + 4)
            else:
                result += " " * indent + f"{key}: {value}\n"
        return result

    with open(output_file, "w") as f:
        f.write(convert(data))


# Example usage -- guarded so importing this module has no side effects.
if __name__ == "__main__":
    json_to_horus("input.json", "output.horus")
#2.2) Text Delimited CSV to HORUS format.
import csv


def csv_to_horus(input_file, output_file, delimiter=",", encoding='utf-8'):
    """Convert a delimited text file into HORUS-style records.

    The first row is treated as the header; every following row becomes a
    ``[Record N]`` section whose lines pair each header with the row value.

    input_file:  path of the delimited text file to read.
    output_file: path the HORUS text is written to (overwritten if present).
    delimiter:   field separator passed to csv.reader.
    encoding:    text encoding used to read input_file.
    """
    with open(input_file, "r", encoding=encoding) as f:
        reader = csv.reader(f, delimiter=delimiter)
        # Default of [] makes an empty input file produce empty output
        # instead of raising StopIteration.
        headers = next(reader, [])
        horus_output = ""
        for i, row in enumerate(reader, start=1):
            horus_output += f"[Record {i}]\n"
            # zip truncates to the shorter of headers/row, so ragged rows
            # never raise.
            for key, value in zip(headers, row):
                horus_output += f" {key}: {value}\n"
            horus_output += "\n"
    with open(output_file, "w") as f:
        f.write(horus_output)


# Example usage -- guarded so importing this module has no side effects.
if __name__ == "__main__":
    csv_to_horus("Country_Code.csv", "output1.horus", delimiter=",", encoding='latin-1')
#3.1) Demonstrate Fixers Utilities
# Cleaning Data
import string
import datetime as dt

dataset = " Data Science with too many spaces is bad ."
print("Before Cleaning\n", dataset)
# strip() trims only the leading/trailing whitespace, not interior runs.
print("Clean Dataset \n", dataset.strip())

# Removing Non-Printable Characters
printable = set(string.printable)
print(printable)
bad_data = "Data\x00Science with\x02 funny characters is \x10bad!!!"
# Keep only characters that are in string.printable.
clean_data = ''.join(ch for ch in bad_data if ch in printable)
print(bad_data)
print(clean_data)

# Reformatting data entry to match specific formatting criteria.
baddate = dt.date(2019, 10, 31)
baddata = baddate.strftime('%Y-%m-%d')  # ISO-style date string
print(baddata)
gooddate = dt.datetime.strptime(baddata, '%Y-%m-%d')
gooddata = gooddate.strftime('%d %B %Y')  # e.g. "31 October 2019"
print(gooddata)
#3.2) Demonstrate Averaging of Data.
import pandas as pd

# Load the sample data set and show it before any processing.
inputfile = "sample_large_data.csv"
df = pd.read_csv(inputfile)
print(df)

# Mean salary per gender group.
processdata = df
meandata = processdata.groupby(['Gender'])['Salary'].mean()
print(meandata)
#3.3)Data Binning or Bucketing.
import pandas as pd

raw_data = {
    "City": ["sawantwadi", "kudal", "malvan", "Devgad",
             "Dodamarg", "Kankavli", "Vaibhavwadi", "vengurla"],
    "Rank": ["1st", "2nd", "1st", "2nd", "1st", "2nd", "1st", "2nd"],
    "Score1": [48, 55, 45, 80, 60, 100, 90, 75],
    "Score2": [20, 60, 85, 96, 55, 65, 30, 96],
}
df = pd.DataFrame(
    raw_data,
    index=pd.Index(list('ABCDEFGH'), name='letter'),
    columns=pd.Index(['City', 'Rank', 'Score1', 'Score2'], name='Attributes'),
)
print(df)

# Bucket Score2 into four labelled ranges: (0,25], (25,50], (50,75], (75,100].
bins = [0, 25, 50, 75, 100]
group = ['low', 'average', 'good', 'brilliant']
df['grade'] = pd.cut(df['Score2'], bins, labels=group)
print(df)
#5.1) Write R program to create the network routing diagram from the given data on routers.
library(igraph)

# Edge list of router links with their costs.
# (Fixed: the original had a stray "(" after c( on the Source vector,
# which is a syntax error.)
router_d <- data.frame(
  Source      = c("Router1", "Router1", "Router2", "Router2", "Router3", "Router4"),
  Destination = c("Router2", "Router3", "Router3", "Router4", "Router4", "Router5"),
  Cost        = c(10, 15, 5, 20, 10, 25)
)

# First two columns are taken as the directed edge list; Cost becomes an
# edge attribute.
g <- graph_from_data_frame(router_d, directed = TRUE)
plot(
  g,
  edge.label = E(g)$Cost,
  vertex.size = 30,
  vertex.label.cex = 1.2,
  edge.arrow.size = 0.5,
  main = "Network Routing Diagram"
)
#5.2) Write a R program to build acyclic graph.
library(igraph)

nodes <- c("A", "B", "C", "D", "E")
# Consecutive pairs form the directed edges: A->B, A->C, B->D, C->E, D->E.
edges <- c("A", "B", "A", "C", "B", "D", "C", "E", "D", "E")
graph <- graph(edges = edges, directed = TRUE)

# is_dag() is TRUE exactly when the directed graph contains no cycle.
msg <- if (is_dag(graph)) "The graph is acyclic (DAG)." else "The graph contains cycles."
print(msg)

plot(graph,
     vertex.color = "skyblue", vertex.size = 30,
     edge.arrow.size = 0.5, vertex.label.cex = 1.2,
     main = "Acyclic Graph")
#6)Build the time hub, links and satellites.
import pandas as pd
import sqlite3
from datetime import datetime, timedelta
import uuid
# Create a SQLite database (or connect to an existing one)
# NOTE: "time_hub.db" is created in the current working directory.
conn = sqlite3.connect("time_hub.db")
# Create Time Hub
def create_time_hub():
    """Build the time hub: 100 rows of (TimeID, Timestamp) 15 minutes apart."""
    start = datetime(2023, 1, 1, 0, 0, 0)  # starting point for time data
    rows = [
        {"TimeID": str(uuid.uuid4()),
         "Timestamp": start + timedelta(minutes=15 * step)}
        for step in range(100)
    ]
    return pd.DataFrame(rows)
# Create Time Satellites
def create_time_satellites(time_hub):
    """Derive a satellite table of calendar parts for every hub timestamp.

    Returns one row per hub row with the TimeID plus the timestamp broken
    into Year/Month/Day/Hour/Minute columns.
    """
    records = [
        {
            "TimeID": hub_row["TimeID"],
            "Year": hub_row["Timestamp"].year,
            "Month": hub_row["Timestamp"].month,
            "Day": hub_row["Timestamp"].day,
            "Hour": hub_row["Timestamp"].hour,
            "Minute": hub_row["Timestamp"].minute,
        }
        for _, hub_row in time_hub.iterrows()
    ]
    return pd.DataFrame(records)
# Create Time Links
def create_time_links(time_hub):
    """Create one link row per (hub timestamp, time zone) pair.

    Each row carries a fresh LinkID, the hub's TimeID, the zone name, and
    the unmodified timestamp.
    """
    zones = ["UTC", "EST", "PST", "CET"]
    link_rows = []
    for _, hub_row in time_hub.iterrows():
        ts = hub_row["Timestamp"]
        link_rows.extend(
            {
                "LinkID": str(uuid.uuid4()),
                "TimeID": hub_row["TimeID"],
                "TimeZone": zone,
                "AdjustedTime": ts,  # for simplicity, no real adjustment is made
            }
            for zone in zones
        )
    return pd.DataFrame(link_rows)
# Generate Data
time_hub = create_time_hub()
time_satellites = create_time_satellites(time_hub)
time_links = create_time_links(time_hub)
# Save to SQLite
# if_exists="replace" drops and recreates each table on every run, so
# re-running the script never duplicates rows.
time_hub.to_sql("TimeHub", conn, if_exists="replace", index=False)
time_satellites.to_sql("TimeSatellites", conn, if_exists="replace", index=False)
time_links.to_sql("TimeLinks", conn, if_exists="replace", index=False)
# Display Results
print("Time Hub:")
print(time_hub.head())
print("\nTime Satellites:")
print(time_satellites.head())
print("\nTime Links:")
print(time_links.head())
# Close connection
conn.close()
#7)Transforming data.
import pandas as pd

# Raw order records; OrderDate arrives as strings and is parsed below.
data = {
    "OrderID": [1, 2, 3, 4, 5],
    "Product": ["Laptop", "Mouse", "Keyboard", "Monitor", "Laptop"],
    "Quantity": [2, 5, 3, 1, 1],
    "Price": [1500, 20, 50, 200, 1500],
    "OrderDate": [
        "2023-01-01 10:15",
        "2023-01-01 12:30",
        "2023-01-02 09:00",
        "2023-01-03 14:45",
        "2023-01-03 16:00",
    ],
}
df = pd.DataFrame(data)

# 1. Convert OrderDate to datetime
df["OrderDate"] = pd.to_datetime(df["OrderDate"])

# 2. Add a Total column (Quantity * Price)
df["Total"] = df["Quantity"] * df["Price"]

# 3. Filter data (e.g., orders after 2023-01-02)
# The string comparand is coerced to Timestamp 2023-01-02 00:00.
filter_df = df[df["OrderDate"] > "2023-01-02"]

# 4. Group by Product and calculate total revenue and quantity sold
# Use the string "sum": passing the builtin `sum` as aggfunc is deprecated
# in recent pandas and bypasses the optimized named aggregation path.
grouped_df = df.groupby("Product").agg(
    TotalRevenue=pd.NamedAgg(column="Total", aggfunc="sum"),
    TotalQuantity=pd.NamedAgg(column="Quantity", aggfunc="sum"),
)

print(df)
print(filter_df)
print(grouped_df)
#Generating data.
#9.1) Random Numerical Data :-
import numpy as np
import pandas as pd

np.random.seed(42)  # fixed seed so the draws are reproducible
ids = range(1, 101)
ages = np.random.randint(18, 60, size=100)        # uniform ints in [18, 60)
salaries = np.random.randint(30000, 120000, size=100)
data = {"ID": ids, "Age": ages, "Salary": salaries}
df = pd.DataFrame(data)
print(df.head())
#9.2) Time-Series Data:-
# 100 consecutive days of simulated temperature readings.
date_range = pd.date_range(start="2023-01-01", periods=100, freq="D")
temps = np.random.uniform(15, 35, size=100)  # uniform floats in [15, 35)
time_series_data = {"Date": date_range, "Temperature": temps}
time_series_df = pd.DataFrame(time_series_data)
print(time_series_df)
#9.3) Categorical Data:-
# 100 IDs, each assigned a random product category.
categories = ["Electronics", "Clothing", "Groceries"]
picks = np.random.choice(categories, size=100)
categorical_data = {"ID": range(1, 101), "Category": picks}
categorical_df = pd.DataFrame(categorical_data)
print(categorical_df)
----------------------Practical 10------------------
#PowerBI
Steps:--
1) Home --> Get Data --> Click on OData feed.
2) Window Popup --> enter URL --> http://services.odata.org/V3/Northwind/Northwind.svc/ and click 'OK'.
3) New Window Screen Will get Open --> select: 'Orders, Products, Order_Details' and then click on Transform Data button
4) click on Products --> Keep Only ProductID, ProductName, QuantityPerUnit, and UnitsInStock columns.
5) Now click on UnitsInStock column and check Data Type --> If its not whole number then change it to Whole Number
6) Now click on Orders data --> see for Order_Details column --> Then click on the opposite arrow sign of Order_Details column --> you will get new window --> select only ProductID, UnitPrice, and Quantity and click on Ok button
7) In this step you keep only OrderDate, ShipCity, ShipCountry, Order_Details.ProductID, Order_Details.UnitPrice,
and Order_Details.Quantity columns
Rename Order_Details.ProductID, Order_Details.UnitPrice, and Order_Details.Quantity to ProductID, UnitPrice, and Quantity
8) In the Add Column ribbon tab, click Add Custom Column --> enter [UnitPrice] * [Quantity] --> Change Column name to LineTotal --> Now click on LineTotal and right click --> Then click on change type option and choose Decimal Number --> Then drag the LineTotal column to the left, after ShipCountry.
9) Now In Home ribbon click on close and apply and then select close and apply
10) Once the data is loaded, select the Manage Relationships button In Modeling ribbon
11) Select Relationship view in Power BI Desktop. You will get to see relationships --> click on Report view
12) Click on clustered column chart and then drag shipcountry to x axis and quantity to y-axis
13) Click on map and then drag shipcountry to Legend box
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment