Skip to content

Instantly share code, notes, and snippets.

View sjtalkar's full-sized avatar
💭
Immersion training in Python and Power BI for DataScience

Simi Talkar sjtalkar

💭
Immersion training in Python and Power BI for DataScience
  • Mondelez Intl.
  • Bellevue, WA
View GitHub Profile
@sjtalkar
sjtalkar / gist:07a95af93aa5e92848bcda10e92b7291
Last active October 11, 2020 21:03
Clean and split delimited column
cities_df = cities_df.replace('(\[note\s\d{1,10}\])', '', regex=True)
cities_df = cities_df.replace('—[\s]?', np.NaN, regex=True)
cities_df= cities_df.applymap(lambda x: np.NaN if x == '' else x)
df.dropna(inplace=True)
#Step: create '+' separated list of teams and then split it
single_teams_df = cities_df['nhl'].str.replace( r"(?<!\s)(?=[A-Z])", '+', regex=True)
single_teams_df= single_teams_df.str.lstrip('+')
single_teams_df= single_teams_df.str.split('+')
cities_df['Team Name Map'] =cities_df['NHL'].apply(lambda teamname: nhl_df[nhl_df['team'].str.contains(teamname)]['team'].item())
@sjtalkar
sjtalkar / gist:f4634f4e54c70c60b35ecf794b18b91f
Last active October 9, 2020 15:22
SQL for partial string comparison
--Option 1
select metropolitanarea,
nhl_team,
team,
win,
loss
from cities_expanded c,
nhl_data n
where n.team like '%' || nhl_team || '%';
@sjtalkar
sjtalkar / gist:4fcb3ac912c525d6637493a0481d1981
Created October 11, 2020 19:15
Python code in Power BI
#'MetAreaPopAndTeams_df' and 'NHLData_df' hold the input data for this script
import pandas as pd
MetAreaPopAndTeams_df['Team Name Map'] = MetAreaPopAndTeams_df['NHL Teams'].apply(lambda teamname: NHLData_df[NHLData_df['Team'].str.contains(teamname)]['Team'].any(0))
@sjtalkar
sjtalkar / gist:21681a8f3b2b1e650c195d48206e54ff
Last active October 19, 2020 00:36
UsingPandasGroupByInPowerBI
# 'dataset' holds the input data for this script
import pandas as pd
df_groups = dataset.groupby(['Destination City Name', 'Origin State', 'Origin City', 'Unique Carrier'])
final_df = df_groups.agg(Total_Flights = ('Flight Date','count'))
final_df['Count of Delayed Flights'] = df_groups.apply(lambda x : x[x['Arrival Delay'] > 0]['Arrival Delay'].count() )
final_df = final_df.reset_index()
@sjtalkar
sjtalkar / gist:c50d51c8d46bd7d72170a9b612d3e906
Last active October 18, 2020 15:36
Analytical function window
with cf as (select
unique_carrier,
origin_city,
origin_state,
destination_city_name,
destination_state,
sum (case when arrival_delay > 0 then 1 else 0 end) as count_delayed_flights,
count(arrival_delay) as total_flight_count
from "ArrivalInfo"
group by
@sjtalkar
sjtalkar / gist:62279587561b62e99d24a109ea11bb63
Last active October 18, 2020 23:23
Usage of Transform in Pandas
#Create a GroupBy object
df_groups = df.groupby(['Destination City Name', 'Origin State', 'Origin City', 'Unique Carrier'])
new_df = df_groups.agg(Total_Flights = ('Flight Date','count'))
new_df['Count Delayed Flights'] = df_groups.apply(lambda x : x[x['Arrival Delay'] > 0]['Arrival Delay'].count() )
new_df['Delayed Per Origin'] = new_df.groupby(['Destination City Name', 'Origin State', 'Origin City'])['Count Delayed Flights'].transform('sum')
@sjtalkar
sjtalkar / gist:bd50792681ebd73625328683f06fa132
Last active October 25, 2020 00:30
Using LEAD window function in an update and merge
--Oracle Merge
merge into depreciation_rate d
using
(select asset,
country,
effective_from,
lead(effective_from) over (partition by asset, country order by effective_from) - 1/86400 as effective_until
from depreciation_rate) n
on (d.asset = n.asset
and d.country = n.country
import pandas as pd
import datetime
# The historical data setup
assets = ['Automobiles', 'Building'] * 2
countries = ['Andorra', 'Andorra', 'Brunei', 'Brunei']
rates = [12.5, 4.5, 14.0, 6.5]
effective_from = ['2019-01-01'] * 4
effective_until = [None] * 4
depreciation_rate_df = pd.DataFrame(list(zip(assets, countries, rates, effective_from, effective_until)),
def createRentalMap(df, fixed_radius, nbd_or_grp):
# Initialize a Folium map. Center it to the mean of latitude and longitude of the entire dataset
this_map = folium.Map(
location=[df["latitude"].mean(), df["longitude"].mean()],
tiles="CartoDB positron",
zoom_start=13,
)
def plotDot(point_row, fixed_radius, nbd_or_grp):