Skip to content

Instantly share code, notes, and snippets.

@kperry2215
Created July 4, 2019 22:47
Show Gist options
  • Save kperry2215/57ae6fa1bc1ccdd4757252127573eb1b to your computer and use it in GitHub Desktop.
Save kperry2215/57ae6fa1bc1ccdd4757252127573eb1b to your computer and use it in GitHub Desktop.
import pandas as pd
from scipy.optimize import curve_fit
import numpy as np
import matplotlib.pyplot as plt
def read_in_csv(file_path):
"""
Read in the specified csv as a pandas dataframe
Arguments:
file_path: String. Path for the csv file that we want to read in
Outputs:
dataframe: Pandas dataframe.
"""
dataframe=pd.read_csv(file_path)
return dataframe
def hyperbolic_equation(t, qi, b, di):
"""
Hyperbolic decline curve equation
Arguments:
t: Float. Time since the well first came online, can be in various units
(days, months, etc) so long as they are consistent.
qi: Float. Initial production rate when well first came online.
b: Float. Hyperbolic decline constant
di: Float. Nominal decline rate at time t=0
Output:
Returns q, or the expected production rate at time t. Float.
"""
return qi/((1.0+b*di*t)**(1.0/b))
def exponential_equation(t, qi, di):
"""
Exponential decline curve equation
Arguments:
t: Float. Time since the well first came online, can be in various units
(days, months, etc) so long as they are consistent.
qi: Float. Initial production rate when well first came online.
di: Float. Nominal decline rate (constant)
Output:
Returns q, or the expected production rate at time t. Float.
"""
return qi*np.exp(-di*t)
def remove_nan_and_zeroes_from_columns(df, variable):
"""
This function cleans up a dataframe by removing rows in a specific
column that are null/NaN or equal to 0. This basically removes zero
production time periods.
Arguments:
df: Pandas dataframe.
variable: String. Name of the column where we want to filter out
NaN's or 0 values
Output:
filtered_df: Pandas dataframe. Dataframe with NaN's and zeroes filtered out of
the specified column
"""
filtered_df = df[(df[variable].notnull()) & (df[variable]>0)]
return filtered_df
def generate_time_delta_column(df, time_column, date_first_online_column):
"""
Create column for the time that a well has been online at each reading, with
the first non-null month in the series listed as the start of production
Arguments:
df: Pandas dataframe
time_column: String. Name of the column that includes the specific record date
that the data was taken at. Column type is pandas datetime
date_first_online_column: Name of the column that includes the date that the
well came online. Column type is pandas datetime
Outputs:
Pandas series containing the difference in days between the date the well
came online and the date that the data was recorded (cumulative days online)
"""
return (df[time_column]-df[date_first_online_column]).dt.days
def get_min_or_max_value_in_column_by_group(dataframe, group_by_column, calc_column, calc_type):
"""
This function obtains the min or max value for a column, with a group by applied. For example,
it could return the earliest (min) RecordDate for each API number in a dataframe
Arguments:
dataframe: Pandas dataframe
group_by_column: string. Name of column that we want to apply a group by to
calc_column: string. Name of the column that we want to get the aggregated max or min for
calc_type: string; can be either 'min' or 'max'. Defined if we want to pull the min value
or the max value for the aggregated column
Outputs:
value: Depends on the calc_column type.
"""
value=dataframe.groupby(group_by_column)[calc_column].transform(calc_type)
return value
def get_max_initial_production(df, number_first_months, variable_column, date_column):
"""
This function allows you to look at the first X months of production, and selects
the highest production month as max initial production
Arguments:
df: Pandas dataframe.
number_first_months: float. Number of months from the point the well comes online
to compare to get the max initial production rate qi (this looks at multiple months
in case there is a production ramp-up)
variable_column: String. Column name for the column where we're attempting to get
the max volume from (can be either 'Gas' or 'Oil' in this script)
date_column: String. Column name for the date that the data was taken at
"""
#First, sort the data frame from earliest to most recent prod date
df=df.sort_values(by=date_column)
#Pull out the first x months of production, where number_first_months is x
df_beginning_production=df.head(number_first_months)
#Return the max value in the selected variable column from the newly created
#df_beginning_production df
return df_beginning_production[variable_column].max()
def plot_actual_vs_predicted_by_equations(df, x_variable, y_variables, plot_title):
"""
This function is used to map x- and y-variables against each other
Arguments:
df: Pandas dataframe.
x_variable: String. Name of the column that we want to set as the
x-variable in the plot
y_variables: string (single), or list of strings (multiple). Name(s)
of the column(s) that we want to set as the y-variable in the plot
"""
#Plot results
df.plot(x=x_variable, y=y_variables, title=plot_title)
plt.show()
def main():
#Read in the monthly oil and gas data
file_path='master_dataframe_production.csv'
bakken_data=read_in_csv(file_path)
#Perform some data cleaning to get the columns as the right data type
bakken_data['ReportDate']=pd.to_datetime(bakken_data['ReportDate'])
#Declare the desired product that we want to curve fit for--it can either by 'Gas' or 'Oil'
desired_product_type='Oil'
#Remove all rows with null values in the desired time series column
bakken_data=remove_nan_and_zeroes_from_columns(bakken_data, desired_product_type)
#Get the earliest RecordDate for each API Number
bakken_data['Online_Date']= get_min_or_max_value_in_column_by_group(bakken_data, group_by_column='API_WELLNO',
calc_column='ReportDate', calc_type='min')
#Generate column for time online delta
bakken_data['Days_Online']=generate_time_delta_column(bakken_data, time_column='ReportDate',
date_first_online_column='Online_Date')
#Pull data that came online between January and June 2016
bakken_data_2016=bakken_data[(bakken_data.Online_Date>='2016-01-01') & (bakken_data.Online_Date<='2016-06-01')]
#Get a list of unique API's to loop through--these were randomly selected as examples
unique_well_APIs_list=[33023013930000.0, 33105039980000.0, 33105039970000.0,
33013018230000.0, 33013018220000.0]
#Loop through each API, and perform calculations
for api_number in unique_well_APIs_list:
#Subset the dataframe by API Number
production_time_series=bakken_data_2016[bakken_data_2016.API_WELLNO==api_number]
#Get the highest value of production in the first three months of production, to use as qi value
qi=get_max_initial_production(production_time_series, 3, desired_product_type, 'ReportDate')
#Exponential curve fit the data to get best fit equation
popt_exp, pcov_exp=curve_fit(exponential_equation, production_time_series['Days_Online'],
production_time_series[desired_product_type],bounds=(0, [qi,20]))
print('Exponential Fit Curve-fitted Variables: qi='+str(popt_exp[0])+', di='+str(popt_exp[1]))
#Hyperbolic curve fit the data to get best fit equation
popt_hyp, pcov_hyp=curve_fit(hyperbolic_equation, production_time_series['Days_Online'],
production_time_series[desired_product_type],bounds=(0, [qi,2,20]))
print('Hyperbolic Fit Curve-fitted Variables: qi='+str(popt_hyp[0])+', b='+str(popt_hyp[1])+', di='+str(popt_hyp[2]))
#Exponential fit results
production_time_series.loc[:,'Exponential_Predicted']=exponential_equation(production_time_series['Days_Online'],
*popt_exp)
#Hyperbolic fit results
production_time_series.loc[:,'Hyperbolic_Predicted']=hyperbolic_equation(production_time_series['Days_Online'],
*popt_hyp)
#Declare the x- and y- variables that we want to plot against each other
y_variables=[desired_product_type, "Hyperbolic_Predicted", "Exponential_Predicted"]
x_variable='Days_Online'
#Create the plot title
plot_title=desired_product_type+' Production for Well API '+str(api_number)
#Plot the data to visualize the equation fit
plot_actual_vs_predicted_by_equations(production_time_series, x_variable, y_variables, plot_title)
if __name__== "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment