Skip to content

Instantly share code, notes, and snippets.

View sjtalkar's full-sized avatar
💭
Immersion training in Python and Power BI for DataScience

Simi Talkar sjtalkar

💭
Immersion training in Python and Power BI for DataScience
  • Mondelez Intl.
  • Bellevue, WA
View GitHub Profile
from sklearn.feature_extraction.text import CountVectorizer
# normalize to lower case
df_lower = covid1_df['covid_prepare'].str.lower().to_frame()
df_lower['covid_prepare'].values
#Convert all including nan to strings
doc = df_lower['covid_prepare'].map(str).values
#delete nans
doc = np.delete(doc, np.where(doc == 'nan'))
@sjtalkar
sjtalkar / gist:11b3ca21c440b2508f7d5fa53f2f112a
Last active December 20, 2020 13:23
Cleaning the rental dataframe and identifying property types
def createPropertyTypeCol(rental_df):
"""This function adds a column called property_type_class to the dataframe
Args:
rental_df ([type]): [The dataframe has a column called property_type]
Returns:
[type]: [Dataframe with a more concise list of property types]
"""
# Property types Private room and Shared Room identified
def createRentalMap(df, fixed_radius, nbd_or_grp):
# Initialize a Folium map. Center it to the mean of latitude and longitude of the entire dataset
this_map = folium.Map(
location=[df["latitude"].mean(), df["longitude"].mean()],
tiles="CartoDB positron",
zoom_start=13,
)
def plotDot(point_row, fixed_radius, nbd_or_grp):
import pandas as pd
import datetime
# The historical data setup
assets = ['Automobiles', 'Building'] * 2
countries = ['Andorra', 'Andorra', 'Brunei', 'Brunei']
rates = [12.5, 4.5, 14.0, 6.5]
effective_from = ['2019-01-01'] * 4
effective_until = [None] * 4
depreciation_rate_df = pd.DataFrame(list(zip(assets, countries, rates, effective_from, effective_until)),
@sjtalkar
sjtalkar / gist:bd50792681ebd73625328683f06fa132
Last active October 25, 2020 00:30
Using LEAD window function in an update and merge
--Oracle Merge
merge into depreciation_rate d
using
(select asset,
country,
effective_from,
lead(effective_from) over (partition by asset, country order by effective_from) - 1/86400 as effective_until
from depreciation_rate) n
on (d.asset = n.asset
and d.country = n.country
@sjtalkar
sjtalkar / gist:62279587561b62e99d24a109ea11bb63
Last active October 18, 2020 23:23
Usage of Transform in Pandas
#Create a GroupBy object
df_groups = df.groupby(['Destination City Name', 'Origin State', 'Origin City', 'Unique Carrier'])
new_df = df_groups.agg(Total_Flights = ('Flight Date','count'))
new_df['Count Delayed Flights'] = df_groups.apply(lambda x : x[x['Arrival Delay'] > 0]['Arrival Delay'].count() )
new_df['Delayed Per Origin'] = new_df.groupby(['Destination City Name', 'Origin State', 'Origin City'])['Count Delayed Flights'].transform('sum')
@sjtalkar
sjtalkar / gist:c50d51c8d46bd7d72170a9b612d3e906
Last active October 18, 2020 15:36
Analytical function window
with cf as (select
unique_carrier,
origin_city,
origin_state,
destination_city_name,
destination_state,
sum (case when arrival_delay > 0 then 1 else 0 end) as count_delayed_flights,
count(arrival_delay) as total_flight_count
from "ArrivalInfo"
group by
@sjtalkar
sjtalkar / gist:21681a8f3b2b1e650c195d48206e54ff
Last active October 19, 2020 00:36
UsingPandasGroupByInPowerBI
# 'dataset' holds the input data for this script
import pandas as pd
df_groups = dataset.groupby(['Destination City Name', 'Origin State', 'Origin City', 'Unique Carrier'])
final_df = df_groups.agg(Total_Flights = ('Flight Date','count'))
final_df['Count of Delayed Flights'] = df_groups.apply(lambda x : x[x['Arrival Delay'] > 0]['Arrival Delay'].count() )
final_df = final_df.reset_index()
@sjtalkar
sjtalkar / gist:4fcb3ac912c525d6637493a0481d1981
Created October 11, 2020 19:15
Python code in Power BI
#'MetAreaPopAndTeams_df' and 'NHLData_df' hold the input data for this script
import pandas as pd
MetAreaPopAndTeams_df['Team Name Map'] = MetAreaPopAndTeams_df['NHL Teams'].apply(lambda teamname: NHLData_df[NHLData_df['Team'].str.contains(teamname)]['Team'].any(0))
@sjtalkar
sjtalkar / gist:f4634f4e54c70c60b35ecf794b18b91f
Last active October 9, 2020 15:22
SQL for partial string comparison
--Option 1
select metropolitanarea,
nhl_team,
team,
win,
loss
from cities_expanded c,
nhl_data n
where n.team like '%' || nhl_team || '%';