-
-
Save misraX/399f19920821758cda586c2e82aedc65 to your computer and use it in GitHub Desktop.
#!/bin/bash
#
# Created By: misraX
# Github: github.com/misrax
# License: MIT
# Copyright: 2017
#
# Bash script to install packages and generate pip requirements.txt from
# inside a virtualenv, using `pip freeze` to list all of the virtualenv's
# pip packages. A simple way of creating requirements.txt for development.
#
# Usage: pip-install package1 package2 package3 ... package(n)
#
# Capture ALL command-line arguments as an array. The original
# `packages=($1)` expanded only the first argument, so
# `pip-install a b c` installed just `a`.
packages=("$@")
echo "Starting to install packages, Hooopa....."
# Only proceed when running inside an activated virtualenv
# ($VIRTUAL_ENV is set by virtualenv's activate script).
if [[ "$VIRTUAL_ENV" != "" ]]; then
    echo "You are in a working virtualenv $VIRTUAL_ENV"
    # Require at least one package argument.
    if [[ ${#packages[@]} -gt 0 ]]; then
        pip install "${packages[@]}"
        echo "Where do you want to save your requirements.txt? Type the path and press [ENTER]"
        ls "$VIRTUAL_ENV"
        # -r: don't let backslashes mangle the typed path.
        read -r requirements_path
        if [[ "$requirements_path" == "" ]]; then
            echo "Oh please, enter something :D - now call this script again to retry"
            exit 1
        # The path is interpreted relative to the virtualenv root.
        elif [[ -d "$VIRTUAL_ENV/$requirements_path" ]]; then
            echo "creating requirements.txt"
            pip freeze > "$VIRTUAL_ENV/$requirements_path/requirements.txt"
        else
            echo "This is not a valid directory under $VIRTUAL_ENV"
        fi
    else
        echo "Please enter at least one package to install"
        exit 1
    fi
else
    echo "You are not in a working virtualenv"
    echo "Exiting .........."
    exit 1
fi
#
# TODO: add environment variables to set DJANGO_PATH and REQUIREMENTS_PATH.
#
import os
import pandas as pd
import numpy as np
from math import radians, sin, cos, atan2, sqrt
from datetime import datetime
# -----------------------------------------------------------------------------
# CONFIGURATION
# -----------------------------------------------------------------------------
DATA_FOLDER = "./"  # Folder where your Excel files are stored

EXCEL_FILES = [
    # List all your Excel filenames here:
    "WK.xlsx",
    "27727861641.xlsx",
    "G(1).xlsx",
    "Schalk Pienaar.xlsx",
    "CR.xlsx",
    "BP_1123383_-Police_Case_Query-_AMA.xlsx",
    "summary of cell numbers.xlsx",
    "Manie 27823078393_1.xlsx",
    "Manie 27823078393_2.xlsx",
]
# Update these lists as needed, or glob them automatically if you prefer.

# Phone number references (update to match your data format exactly)
MANIE = "+27823078393"
SP = "+27824634777"
GG = "+27637762604"
JU = "+27727861641"
CALLIE = "+27834124723"
WK = "+27825248358"  # Example victim

# Columns in your Excel data (update to match your actual column names)
COL_CALLING = "CallingNumber"   # e.g. "MSISDN" or "CallingNumber"
COL_CALLED = "CalledNumber"     # e.g. "Other Party"
COL_LAT = "Latitude"
COL_LON = "Longitude"
COL_DATE = "Start Date"         # e.g. "StartDate", "Date/Time"
COL_DURATION = "Call Duration"  # e.g. "Call Duration", "Duration"

# -----------------------------------------------------------------------------
# HELPER FUNCTIONS
# -----------------------------------------------------------------------------
def load_all_excel(files, folder=DATA_FOLDER):
    """
    Load and concatenate all Excel files into a single pandas DataFrame.

    Parameters
    ----------
    files : iterable of str
        Excel filenames, resolved relative to *folder*.
    folder : str
        Directory containing the files (defaults to DATA_FOLDER).

    Returns
    -------
    pandas.DataFrame
        All rows from every file that exists, with a fresh index.
        Missing files are skipped with a printed warning; an empty
        DataFrame is returned when nothing could be loaded.
    """
    frames = []
    for f in files:
        path = os.path.join(folder, f)
        if os.path.exists(path):
            frames.append(pd.read_excel(path))
        else:
            print(f"WARNING: File not found -> {path}")
    # Concatenate once at the end: calling pd.concat inside the loop copies
    # every previously-loaded row on each iteration (quadratic behavior).
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
def haversine_distance(lat1, lon1, lat2, lon2):
    """
    Return the great-circle (Haversine) distance in meters between two
    latitude/longitude points given in decimal degrees.
    """
    earth_radius_m = 6371_000
    phi1 = radians(lat1)
    phi2 = radians(lat2)
    half_dphi = radians(lat2 - lat1) / 2
    half_dlambda = radians(lon2 - lon1) / 2
    # Haversine formula: a is the square of half the chord length.
    chord_sq = sin(half_dphi) ** 2 + cos(phi1) * cos(phi2) * sin(half_dlambda) ** 2
    angular_distance = 2 * atan2(sqrt(chord_sq), sqrt(1 - chord_sq))
    return earth_radius_m * angular_distance
# -----------------------------------------------------------------------------
# MAIN ANALYSIS
# -----------------------------------------------------------------------------
def main():
    """
    Run four call-record analyses over the combined Excel data and write
    one CSV report for each:

    (A) pairs of records within 15 m of each other at roughly the same time
    (B) communications involving Manie, SP, GG, or JU
    (C) interactions among GG, Callie, and JU between 2022-11-14 and 2022-11-22
    (D) direct interactions between GG and JU
    """
    # 1. Load & clean data
    df = load_all_excel(EXCEL_FILES, DATA_FOLDER)
    if df.empty:
        print("No data loaded. Please check your file paths and names.")
        return

    # Ensure consistent column names across sources if they differ, e.g.:
    # df.rename(columns={
    #     "MSISDN": "CallingNumber",
    #     "Other Party": "CalledNumber",
    #     "StartDate": "Start Date",
    # }, inplace=True)

    # 2. Convert date column to datetime; unparseable values become NaT.
    df[COL_DATE] = pd.to_datetime(df[COL_DATE], errors='coerce')

    # -------------------------------------------------------------------------
    # (A) Instances where 2+ phones were within 15m at the same time
    # -------------------------------------------------------------------------
    # Naive pairwise scan over time-sorted records; the early `break` keeps
    # the inner loop short because only records inside the time window are
    # compared. For large data sets consider spatial indexing or chunking.
    df_sorted = df.dropna(subset=[COL_LAT, COL_LON, COL_DATE]).sort_values(by=COL_DATE)
    records = df_sorted.to_dict(orient='records')
    proximity_results = []
    # "At the same time" is defined as within +/- 30 seconds.
    TIME_THRESHOLD = pd.Timedelta(seconds=30)
    for i in range(len(records)):
        r1 = records[i]
        for j in range(i + 1, len(records)):
            r2 = records[j]
            # Records are sorted by time, so once r2 falls outside the
            # window no later record can be inside it either.
            if abs(r2[COL_DATE] - r1[COL_DATE]) > TIME_THRESHOLD:
                break
            dist = haversine_distance(r1[COL_LAT], r1[COL_LON], r2[COL_LAT], r2[COL_LON])
            if dist <= 15:
                # NOTE(review): "Phone1"/"Phone2" here are r1's calling and
                # called numbers (not r1 vs r2) — confirm this is the
                # intended CSV schema before relying on these columns.
                proximity_results.append({
                    "Phone1": r1[COL_CALLING],
                    "Phone2": r1[COL_CALLED],
                    "Phone1_Lat": r1[COL_LAT],
                    "Phone1_Lon": r1[COL_LON],
                    "Phone2_Calling": r2[COL_CALLING],
                    "Phone2_Called": r2[COL_CALLED],
                    "Phone2_Lat": r2[COL_LAT],
                    "Phone2_Lon": r2[COL_LON],
                    "Distance_m": round(dist, 2),
                    "Time1": r1[COL_DATE],
                    "Time2": r2[COL_DATE],
                })
    proximity_df = pd.DataFrame(proximity_results)
    proximity_df.to_csv("proximity_15m.csv", index=False)
    print("(A) Proximity analysis complete. Results -> proximity_15m.csv")

    # -------------------------------------------------------------------------
    # (B) Communication between Manie, SP, GG, and JU
    # -------------------------------------------------------------------------
    # Keep rows where either endpoint of the call is a number of interest.
    interest_set = {MANIE, SP, GG, JU}

    def in_interest_set(row):
        return (row[COL_CALLING] in interest_set) or (row[COL_CALLED] in interest_set)

    df_comm = df[df.apply(in_interest_set, axis=1)].copy()
    df_comm.to_csv("communication_manie_sp_gg_ju.csv", index=False)
    print("(B) Communication between Manie, SP, GG, and JU -> communication_manie_sp_gg_ju.csv")

    # -------------------------------------------------------------------------
    # (C) Interactions between GG, Callie, and JU in 14/11/2022 - 22/11/2022
    # -------------------------------------------------------------------------
    # Filter by date range and phone numbers.
    start_date = pd.to_datetime("2022-11-14")
    end_date = pd.to_datetime("2022-11-22")  # inclusive of 22 Nov midnight only
    subset_phones = {GG, CALLIE, JU}
    df_subset = df[
        (df[COL_DATE] >= start_date) &
        (df[COL_DATE] <= end_date) &
        (
            (df[COL_CALLING].isin(subset_phones)) |
            (df[COL_CALLED].isin(subset_phones))
        )
    ].copy()
    df_subset.to_csv("gg_callie_ju_14_11_to_22_11.csv", index=False)
    print("(C) Interactions for GG, Callie, JU (14/11/2022 - 22/11/2022) -> gg_callie_ju_14_11_to_22_11.csv")

    # -------------------------------------------------------------------------
    # (D) Interactions between GG and JU
    # -------------------------------------------------------------------------
    # Calls in either direction between GG and JU over the entire dataset.
    mask_gg_ju = (
        ((df[COL_CALLING] == GG) & (df[COL_CALLED] == JU)) |
        ((df[COL_CALLING] == JU) & (df[COL_CALLED] == GG))
    )
    df_gg_ju = df[mask_gg_ju].copy()
    df_gg_ju.to_csv("gg_ju_interactions.csv", index=False)
    print("(D) Interactions between GG and JU -> gg_ju_interactions.csv")

    print("All analyses complete. CSV reports generated.")


# Fixed entry-point guard: the original `if name == "main":` raises
# NameError — the dunder names are required.
if __name__ == "__main__":
    main()
I'm trying to feed a requirements.txt file into the `packages` variable, but it shows an "invalid requirement" error. Please help. I want the packages to be installed from a text file (requirements.txt), and the package dependencies to come from the generated additional_requirements.txt file.

P.S. My code works when only one package is specified in requirements.txt, but fails when more than one package name is entered.