bjulius · July 12, 2023 03:50 · bjulius · Jul 12, 2023
diff --git a/gistfile1.txt b/gistfile1.txt
 PYTHON CODE
 ***********

 import pandas as pd

 # Load the data
 data = pd.read_csv('/mnt/data/Matthias Challenge.csv')

 # Convert 'Date' to datetime format
 data['Date'] = pd.to_datetime(data['Date'])

 # Remove commas from 'Unit Value' and 'Value Total' and convert to numeric
 data['Unit Value'] = data['Unit Value'].str.replace(',', '').astype(float)
 data['Value Total'] = data['Value Total'].str.replace(',', '').astype(float)

 # Separate sales and returns
 sales = data[data['Qty'] > 0].sort_values(by='Date')
 returns = data[data['Qty'] < 0].sort_values(by='Date')

 # Create a list to store the IDs of sales to be eliminated
 sales_to_eliminate = []

 # For each return, find the corresponding sale
 for _, return_row in returns.iterrows():
    product = return_row['ID_Product']
    qty = abs(return_row['Qty'])
    return_date = return_row['Date']
    
    # Find sales of the same product and quantity before the return date
    matching_sales = sales[(sales['ID_Product'] == product) & (sales['Qty'] == qty) & (sales['Date'] < return_date)]
    
    # If there are matching sales, take the one with the latest date (nearest to the return)
    if not matching_sales.empty:
        sale_to_eliminate = matching_sales.iloc[-1]
        sales_to_eliminate.append(sale_to_eliminate['ID_Sales'])
        
        # Remove this sale from the sales dataframe to avoid using it for another return
        sales = sales[sales['ID_Sales'] != sale_to_eliminate['ID_Sales']]

 # Create a list of all IDs to eliminate (returns and their corresponding sales)
 ids_to_eliminate = list(returns['ID_Sales']) + sales_to_eliminate

 # Eliminate these IDs from the original dataframe
 final_data = data[~data['ID_Sales'].isin(ids_to_eliminate)]

 final_data

 PQ/M CODE
 **********
 let
    // Load the data
    Source = Csv.Document(File.Contents("/mnt/data/Matthias Challenge.csv"),[Delimiter=",", Columns=7, Encoding=1252, QuoteStyle=QuoteStyle.None]),
    #"Promoted Headers" = Table.PromoteHeaders(Source, [PromoteAllScalars=true]),
    #"Changed Type" = Table.TransformColumnTypes(#"Promoted Headers",{{"ID_Sales", Int64.Type}, {"Date", type date}, {"ID_Product", Int64.Type}, {"Product", type text}, {"Qty", Int64.Type}, {"Unit Value", Currency.Type}, {"Value Total", Currency.Type}}),
    // Separate sales and returns
    Sales = Table.SelectRows(#"Changed Type", each ([Qty] > 0)),
    Returns = Table.SelectRows(#"Changed Type", each ([Qty] < 0)),
    // Sort sales and returns by date
    SortedSales = Table.Sort(Sales,{{"Date", Order.Ascending}}),
    SortedReturns = Table.Sort(Returns,{{"Date", Order.Ascending}}),
    // Join sales and returns on product and quantity
    Joined = Table.NestedJoin(SortedReturns,{"ID_Product", "Qty"},SortedSales,{"ID_Product", "Qty"},"NewColumn",JoinKind.Inner),
    // Filter out sales that occurred after the return
    Filtered = Table.SelectRows(Joined, each [Date] > [NewColumn.Date]),
    // Group by sales ID and keep the one with the latest date
    Grouped = Table.Group(Filtered, {"ID_Sales"}, {{"LatestDate", each List.Max([Date]), type date}}),
    // Remove returns and their corresponding sales from the original data
    Final = Table.SelectRows(#"Changed Type", each ([ID_Sales] <> Grouped[ID_Sales]))
 in
    Final

 R CODE
 *******

 # Load necessary libraries
 library(dplyr)
 library(lubridate)

 # Load the data
 data <- read.csv("/mnt/data/Matthias Challenge.csv")

 # Convert 'Date' to date format
 data$Date <- mdy(data$Date)

 # Separate sales and returns
 sales <- filter(data, Qty > 0) %>% arrange(Date)
 returns <- filter(data, Qty < 0) %>% arrange(Date)

 # Initialize an empty vector to store the IDs of sales to be eliminated
 sales_to_eliminate <- c()

 # For each return, find the corresponding sale
 for (i in 1:nrow(returns)) {
  # Find sales of the same product and quantity before the return date
  matching_sales <- filter(sales, ID_Product == returns$ID_Product[i] & Qty == abs(returns$Qty[i]) & Date < returns$Date[i])
  
  # If there are matching sales, take the one with the latest date (nearest to the return)
  if (nrow(matching_sales) > 0) {
    sale_to_eliminate <- tail(matching_sales, n=1)$ID_Sales
    sales_to_eliminate <- c(sales_to_eliminate, sale_to_eliminate)
    
    # Remove this sale from the sales dataframe to avoid using it for another return
    sales <- filter(sales, ID_Sales != sale_to_eliminate)
  }
 }

 # Create a vector of all IDs to eliminate (returns and their corresponding sales)
 ids_to_eliminate <- c(returns$ID_Sales, sales_to_eliminate)

 # Eliminate these IDs from the original dataframe
 final_data <- filter(data, !(ID_Sales %in% ids_to_eliminate))

 final_data
	PYTHON CODE
	***********

	import pandas as pd

	# Load the data
	data = pd.read_csv('/mnt/data/Matthias Challenge.csv')

	# Convert 'Date' to datetime format
	data['Date'] = pd.to_datetime(data['Date'])

	# Remove commas from 'Unit Value' and 'Value Total' and convert to numeric
	data['Unit Value'] = data['Unit Value'].str.replace(',', '').astype(float)
	data['Value Total'] = data['Value Total'].str.replace(',', '').astype(float)

	# Separate sales and returns
	sales = data[data['Qty'] > 0].sort_values(by='Date')
	returns = data[data['Qty'] < 0].sort_values(by='Date')

	# Create a list to store the IDs of sales to be eliminated
	sales_to_eliminate = []

	# For each return, find the corresponding sale
	for _, return_row in returns.iterrows():
	product = return_row['ID_Product']
	qty = abs(return_row['Qty'])
	return_date = return_row['Date']

	# Find sales of the same product and quantity before the return date
	matching_sales = sales[(sales['ID_Product'] == product) & (sales['Qty'] == qty) & (sales['Date'] < return_date)]

	# If there are matching sales, take the one with the latest date (nearest to the return)
	if not matching_sales.empty:
	sale_to_eliminate = matching_sales.iloc[-1]
	sales_to_eliminate.append(sale_to_eliminate['ID_Sales'])

	# Remove this sale from the sales dataframe to avoid using it for another return
	sales = sales[sales['ID_Sales'] != sale_to_eliminate['ID_Sales']]

	# Create a list of all IDs to eliminate (returns and their corresponding sales)
	ids_to_eliminate = list(returns['ID_Sales']) + sales_to_eliminate

	# Eliminate these IDs from the original dataframe
	final_data = data[~data['ID_Sales'].isin(ids_to_eliminate)]

	final_data

	PQ/M CODE
	**********
	let
	// Load the data
	Source = Csv.Document(File.Contents("/mnt/data/Matthias Challenge.csv"),[Delimiter=",", Columns=7, Encoding=1252, QuoteStyle=QuoteStyle.None]),
	#"Promoted Headers" = Table.PromoteHeaders(Source, [PromoteAllScalars=true]),
	#"Changed Type" = Table.TransformColumnTypes(#"Promoted Headers",{{"ID_Sales", Int64.Type}, {"Date", type date}, {"ID_Product", Int64.Type}, {"Product", type text}, {"Qty", Int64.Type}, {"Unit Value", Currency.Type}, {"Value Total", Currency.Type}}),
	// Separate sales and returns
	Sales = Table.SelectRows(#"Changed Type", each ([Qty] > 0)),
	Returns = Table.SelectRows(#"Changed Type", each ([Qty] < 0)),
	// Sort sales and returns by date
	SortedSales = Table.Sort(Sales,{{"Date", Order.Ascending}}),
	SortedReturns = Table.Sort(Returns,{{"Date", Order.Ascending}}),
	// Join sales and returns on product and quantity
	Joined = Table.NestedJoin(SortedReturns,{"ID_Product", "Qty"},SortedSales,{"ID_Product", "Qty"},"NewColumn",JoinKind.Inner),
	// Filter out sales that occurred after the return
	Filtered = Table.SelectRows(Joined, each [Date] > [NewColumn.Date]),
	// Group by sales ID and keep the one with the latest date
	Grouped = Table.Group(Filtered, {"ID_Sales"}, {{"LatestDate", each List.Max([Date]), type date}}),
	// Remove returns and their corresponding sales from the original data
	Final = Table.SelectRows(#"Changed Type", each ([ID_Sales] <> Grouped[ID_Sales]))
	in
	Final

	R CODE
	*******

	# Load necessary libraries
	library(dplyr)
	library(lubridate)

	# Load the data
	data <- read.csv("/mnt/data/Matthias Challenge.csv")

	# Convert 'Date' to date format
	data$Date <- mdy(data$Date)

	# Separate sales and returns
	sales <- filter(data, Qty > 0) %>% arrange(Date)
	returns <- filter(data, Qty < 0) %>% arrange(Date)

	# Initialize an empty vector to store the IDs of sales to be eliminated
	sales_to_eliminate <- c()

	# For each return, find the corresponding sale
	for (i in 1:nrow(returns)) {
	# Find sales of the same product and quantity before the return date
	matching_sales <- filter(sales, ID_Product == returns$ID_Product[i] & Qty == abs(returns$Qty[i]) & Date < returns$Date[i])

	# If there are matching sales, take the one with the latest date (nearest to the return)
	if (nrow(matching_sales) > 0) {
	sale_to_eliminate <- tail(matching_sales, n=1)$ID_Sales
	sales_to_eliminate <- c(sales_to_eliminate, sale_to_eliminate)

	# Remove this sale from the sales dataframe to avoid using it for another return
	sales <- filter(sales, ID_Sales != sale_to_eliminate)
	}
	}

	# Create a vector of all IDs to eliminate (returns and their corresponding sales)
	ids_to_eliminate <- c(returns$ID_Sales, sales_to_eliminate)

	# Eliminate these IDs from the original dataframe
	final_data <- filter(data, !(ID_Sales %in% ids_to_eliminate))

	final_data