Python Programs
NYCGasPrices.py
#------------------------------------------------------------------------#
Description: Visualization for NY GasPrices data
#------------------------------------------------------------------------#
import pandas as pd
import pandasql as psql
import matplotlib.pyplot as plt
import matplotlib.style as style
import numpy as np
import re
df = pd.read_csv("NY_GasPrices.csv", skiprows=2)
df = df.iloc[:-1, :] #Last empty row is dropped
#Creating Year Column
year = []
for i in df['Date']:
year.append(int(i[4:8]))
df['Year'] = year
#df_new will only contain the Year column and Gas prices, from 2016 to present
q = 'SELECT Year, "New York City Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)" AS GasPrices FROM df WHERE Year >= 2016'
gas = psql.sqldf(q)
df_new = pd.DataFrame(gas)
#df_new will now be grouped by the year and the average price is computed for each year
q_2 = 'SELECT Year, AVG(GasPrices) as AvgGasPrices FROM df_new GROUP BY Year'
avgGas = psql.sqldf(q_2)
df_new = pd.DataFrame(avgGas)
print(df_new)
style.use('fivethirtyeight')
df_new = df_new.plot(x = "Year", y = "AvgGasPrices", lw = 3)
plt.ylabel('Gas Prices', fontsize = 12)
plt.xlabel('Year', fontsize = 12)
plt.title('Average Gas Prices 2016-2021', fontsize = 13, weight = 'bold', alpha = .75)
plt.tight_layout()
fig = plt.gcf()
fig.savefig("gasPrices.png")
plt.show()
NYCExpenditure.py
#------------------------------------------------------------------------#
Description: Visualization for NY Expenditure data
#------------------------------------------------------------------------#
import pandas as pd
import pandasql as psql
import matplotlib.pyplot as plt
import matplotlib.style as style
import numpy as np
import re
df = pd.read_csv("expenditure_NYPCEPC.csv")
print(df)
pattern = r'[0-9]{4}'
colNames = []
for i in df['DATE']:
if(re.match(pattern, str(i))):
colNames.append("CY " + i[:4])
newDF = pd.DataFrame({
'Year' : colNames,
'Avg Expenditure' : df['NYPCEPC']
})
style.use('fivethirtyeight')
newDF = newDF.plot(x = "Year", y = "Avg Expenditure", lw = 3)
plt.ylabel('Annual Spending', fontsize = 12)
plt.xlabel('Year', fontsize = 12)
plt.title('Per Capita Personal Consumption Expenditures', fontsize = 13, weight = 'bold', alpha = .75)
plt.tight_layout()
fig = plt.gcf()
fig.savefig("Expenditure.png")
plt.show()
NYCWaste.py
#------------------------------------------------------------------------#
Description: Visualization for NYC Waste data
#------------------------------------------------------------------------#
import pandas as pd
import pandasql as psql
import matplotlib.pyplot as plt
import matplotlib.style as style
import numpy as np
import re
df = pd.read_csv("NYC_Waste.csv")
pattern = r'CY +[0-9]{4}'
colNames = []
for i in df:
if(re.match(pattern, str(i))):
colNames.append(i)
totals = []
for j in colNames:
totals.append(float(df[j].iloc[-1]))
newDF = pd.DataFrame({'Year' : colNames,
'Waste GHG' : totals
})
print(newDF)
style.use('fivethirtyeight')
newDF = newDF.plot(x = "Year", y = "Waste GHG", lw = 2.5)
plt.ylabel('Waste GHG', fontsize = 12)
plt.xlabel('Year', fontsize = 12)
plt.title('Waste Greenhouse Gases Emissions', fontsize = 13, weight = 'bold', alpha = .75)
plt.tight_layout()
fig = plt.gcf()
fig.savefig("Waste.png")
plt.show()
NYCTransportation.py
#------------------------------------------------------------------------#
Description: Visualization for NYC Transportation data
#------------------------------------------------------------------------#
import pandas as pd
import pandasql as psql
import matplotlib.pyplot as plt
import matplotlib.style as style
import numpy as np
import re
df = pd.read_csv("NYC_Transportation.csv")
df_filter = pd.read_csv("NYC_Transportation.csv", skiprows=2)
pattern = r'CY +[0-9]{4}'
colNames = []
for i in df:
if(re.match(pattern, str(i))):
colNames.append(i)
q = 'SELECT "CY 2016", "CY 2017", "CY 2018", "CY 2019" FROM df WHERE "CY 2016" not like "%[^0-9]%" and "CY 2016" != "tCO2e"'
numeric = psql.sqldf(q)
numericCols = pd.DataFrame(numeric)
filteredDF = pd.DataFrame({
"Category" : df_filter["(Category, Label)"],
"CY 2016" : numericCols["CY 2016"],
"CY 2017" : numericCols["CY 2017"],
"CY 2018" : numericCols["CY 2018"],
"CY 2019" : numericCols["CY 2019"],
})
#The data has been scraped now it is ready to be processed
q_2 = 'SELECT SUM("CY 2016") AS "CY 2016", SUM("CY 2017") AS "CY 2017", SUM("CY 2018") AS "CY 2018", SUM("CY 2019") AS "CY 2019" FROM filteredDF WHERE Category = "On-Road"'
roadSums = psql.sqldf(q_2)
roadSums = pd.DataFrame(roadSums)
q_3 = 'SELECT SUM("CY 2016") AS "CY 2016", SUM("CY 2017") AS "CY 2017", SUM("CY 2018") AS "CY 2018", SUM("CY 2019") AS "CY 2019" FROM filteredDF WHERE Category = "Railways"'
trainSums = psql.sqldf(q_3)
trainSums = pd.DataFrame(trainSums)
roadTotals = []
trainTotals = []
for t in colNames:
roadTotals.append(float(roadSums[t]))
trainTotals.append(float(trainSums[t]))
newDF = pd.DataFrame({'Year' : colNames,
'Road GHG' : roadTotals,
'Train GHG' : trainTotals
})
print(newDF)
fig,ax = plt.subplots()
ax.plot(newDF['Year'], newDF['Road GHG'], color = "red")
plt.ticklabel_format(style='plain', axis='y')
ax.set_xlabel("Year", fontsize=12)
ax.set_ylabel("Road GHG", color = "red", fontsize = 12)
#twin object for two different y axis
ax2 = ax.twinx()
ax2.plot(newDF['Year'], newDF['Train GHG'], color = "blue")
ax2.set_ylabel("Train GHG", color = "blue", fontsize=12)
plt.title('Road and Train Greenhouse Gases Emissions', fontsize = 13, weight = 'bold', alpha = .75)
plt.ticklabel_format(style='plain', axis='y')
plt.tight_layout()
fig.savefig('RoadandTrainEmissions.png',
format='png',
dpi=100,
bbox_inches='tight')
#Total of both road travel and train travel
q_4 = 'SELECT "CY 2016", "CY 2017", "CY 2018", "CY 2019" FROM filteredDF WHERE Category = "TOTALS"'
total = psql.sqldf(q_4)
total = pd.DataFrame(total)
totalArr = []
for g in colNames:
totalArr.append(float(total[g]))
total = pd.DataFrame({'Year' : colNames,
'Transportation GHG' : totalArr
})
print(total)
style.use('fivethirtyeight')
total = total.plot(x = "Year", y = "Transportation GHG", lw = 2.5)
plt.ylabel('Transportation GHG', fontsize = 12)
plt.xlabel('Year', fontsize = 12)
plt.ticklabel_format(style='plain', axis='y')
plt.title('Transportation Greenhouse Gases Emissions', fontsize = 13, weight = 'bold', alpha = .75)
plt.tight_layout()
fig = plt.gcf()
fig.savefig("Transportation.png")
NYCStationary.py
#------------------------------------------------------------------------#
Description: Visualization for NYC Stationary data
#------------------------------------------------------------------------#
import pandas as pd
import pandasql as psql
import matplotlib.pyplot as plt
import matplotlib.style as style
import numpy as np
import re
df = pd.read_csv("NYC_Stationary.csv")
df_filter = pd.read_csv("NYC_Stationary.csv", skiprows=2)
pattern = r'CY +[0-9]{4}'
colNames = []
for i in df:
if(re.match(pattern, str(i))):
colNames.append(i)
q = 'SELECT "CY 2016", "CY 2017", "CY 2018", "CY 2019" FROM df WHERE "CY 2016" not like "%[^0-9]%" and "CY 2016" != "tCO2e"'
numeric = psql.sqldf(q)
numericCols = pd.DataFrame(numeric)
filteredDF = pd.DataFrame({
"Category" : df_filter["(Category, Label)"],
"CY 2016" : numericCols["CY 2016"],
"CY 2017" : numericCols["CY 2017"],
"CY 2018" : numericCols["CY 2018"],
"CY 2019" : numericCols["CY 2019"],
})
#The data has been scraped now it is ready to be processed
q_2 = 'SELECT SUM("CY 2016") AS "CY 2016", SUM("CY 2017") AS "CY 2017", SUM("CY 2018") AS "CY 2018", SUM("CY 2019") AS "CY 2019" FROM filteredDF WHERE Category = "Residential"'
residentialSums = psql.sqldf(q_2)
residentialSums = pd.DataFrame(residentialSums)
q_3 = 'SELECT SUM("CY 2016") AS "CY 2016", SUM("CY 2017") AS "CY 2017", SUM("CY 2018") AS "CY 2018", SUM("CY 2019") AS "CY 2019" FROM filteredDF WHERE Category = "Commercial and Institutional"'
commercialSums = psql.sqldf(q_3)
commercialSums = pd.DataFrame(commercialSums)
residentialTotals = []
commercialTotals = []
for s in colNames:
residentialTotals.append(float(residentialSums[s]))
commercialTotals.append(float(commercialSums[s]))
newDF = pd.DataFrame({'Year' : colNames,
'Residential GHG' : residentialTotals,
'Commercial GHG' : commercialTotals
})
print(newDF)
x = newDF["Year"]
y = newDF["Residential GHG"]
z = newDF["Commercial GHG"]
resi = plt.plot(x, y, label = "Residential GHG", color = "purple")
comm = plt.plot(x, z, label = "Commercial GHG", color = "orange")
plt.title('Residencial and Commercial GHG Emissions', fontsize = 13, weight = 'bold', alpha = .75)
plt.ticklabel_format(style='plain', axis='y')
plt.ylabel('GHG', fontsize = 12)
plt.xlabel('Year', fontsize = 12)
plt.legend()
plt.tight_layout()
fig = plt.gcf()
fig.savefig('ResidentialandCommercialEmissions.png')
#Total of both residential and commercial emissions
q_4 = 'SELECT "CY 2016", "CY 2017", "CY 2018", "CY 2019" FROM filteredDF WHERE Category = "TOTALS"'
total = psql.sqldf(q_4)
total = pd.DataFrame(total)
totalArr = []
for g in colNames:
totalArr.append(float(total[g]))
total = pd.DataFrame({'Year' : colNames,
'Stationary GHG' : totalArr
})
print(total)
style.use('fivethirtyeight')
total.plot(x = "Year", y = "Stationary GHG", lw = 2.5)
plt.ylabel('Stationary GHG', fontsize = 12)
plt.xlabel('Year', fontsize = 12)
plt.ticklabel_format(style='plain', axis='y')
plt.title('Stationary Greenhouse Gases Emissions', fontsize = 13, weight = 'bold', alpha = .75)
plt.tight_layout()
fig = plt.gcf()
fig.savefig("Stationary.png")
DataFramesCorr.py
#------------------------------------------------------------------------#
Description: After uniting our all dataframes and keeping data from 2016-2019,
we find the correlation between Gas prices and the rest of the pieces of data
and the correlation between Expenditure and the rest of the pieces of data.
#------------------------------------------------------------------------#
import pandas as pd
import pandasql as psql
import matplotlib.pyplot as plt
import numpy as np
import re
#------------------GAS------------------------------------------------------
df_gas = pd.read_csv("NY_GasPrices.csv", skiprows=2)
df_gas = df_gas.iloc[:-1, :] #Last empty row is dropped
#Creating Year Column
gas_year = []
for i in df_gas['Date']:
gas_year.append(int(i[4:8]))
df_gas['Year'] = gas_year
#df_new will only contain the Year column and Gas prices, from 2016 to present
q_gas_1 = 'SELECT Year, "New York City Regular All Formulations Retail Gasoline Prices (Dollars per Gallon)" AS GasPrices FROM df_gas WHERE Year >= 2016 and Year <= 2019'
gas = psql.sqldf(q_gas_1)
dfGas_Result = pd.DataFrame(gas)
#df_new will now be grouped by the year and the average price is computed for each year
q_2 = 'SELECT Year, AVG(GasPrices) as AvgGasPrices FROM dfGas_Result GROUP BY Year'
avgGas = psql.sqldf(q_2)
dfGas_Result = pd.DataFrame(avgGas)
#print(dfGas_Result)
#------------------Waste------------------------------------------------------
df_waste = pd.read_csv("NYC_Waste.csv")
pattern = r'CY +[0-9]{4}'
wasteNames = []
for i in df_waste:
if(re.match(pattern, str(i))):
wasteNames.append(i)
waste_totals = []
for j in wasteNames:
waste_totals.append(float(df_waste[j].iloc[-1]))
dfWaste_Result = pd.DataFrame({'Year' : wasteNames,
'Waste GHG' : waste_totals
})
#print(dfWaste_Result)
#------------------Transportation------------------------------------------------------
df_transportation = pd.read_csv("NYC_Transportation.csv")
df_transportation_filter = pd.read_csv("NYC_Transportation.csv", skiprows=2)
pattern = r'CY +[0-9]{4}'
transportationNames = []
for i in df_transportation:
if(re.match(pattern, str(i))):
transportationNames.append(i)
q = 'SELECT "CY 2016", "CY 2017", "CY 2018", "CY 2019" FROM df_transportation WHERE "CY 2016" not like "%[^0-9]%" and "CY 2016" != "tCO2e"'
transportation_numeric = psql.sqldf(q)
transportation_numericCols = pd.DataFrame(transportation_numeric)
filtered_transportationDF = pd.DataFrame({
"Category" : df_transportation_filter["(Category, Label)"],
"CY 2016" : transportation_numericCols["CY 2016"],
"CY 2017" : transportation_numericCols["CY 2017"],
"CY 2018" : transportation_numericCols["CY 2018"],
"CY 2019" : transportation_numericCols["CY 2019"],
})
q_transportation = 'SELECT "CY 2016", "CY 2017", "CY 2018", "CY 2019" FROM filtered_transportationDF WHERE Category = "TOTALS"'
dfTransportation_Result = psql.sqldf(q_transportation)
dfTransportation_Result = pd.DataFrame(dfTransportation_Result)
transportation_totalArr = []
for g in transportationNames:
transportation_totalArr.append(float(dfTransportation_Result[g]))
dfTransportation_Result = pd.DataFrame({'Year' : transportationNames,
'Transportation GHG' : transportation_totalArr
})
#print(dfTransportation_Result)
#------------------Stationary------------------------------------------------------
df_stationary = pd.read_csv("NYC_Stationary.csv")
df_stationary_filter = pd.read_csv("NYC_Stationary.csv", skiprows=2)
pattern = r'CY +[0-9]{4}'
stationaryNames = []
for i in df_stationary:
if(re.match(pattern, str(i))):
stationaryNames.append(i)
q = 'SELECT "CY 2016", "CY 2017", "CY 2018", "CY 2019" FROM df_stationary WHERE "CY 2016" not like "%[^0-9]%" and "CY 2016" != "tCO2e"'
stationary_numeric = psql.sqldf(q)
stationary_numericCols = pd.DataFrame(stationary_numeric)
filtered_stationaryDF = pd.DataFrame({
"Category" : df_stationary_filter["(Category, Label)"],
"CY 2016" : stationary_numericCols["CY 2016"],
"CY 2017" : stationary_numericCols["CY 2017"],
"CY 2018" : stationary_numericCols["CY 2018"],
"CY 2019" : stationary_numericCols["CY 2019"],
})
q_stationary = 'SELECT "CY 2016", "CY 2017", "CY 2018", "CY 2019" FROM filtered_stationaryDF WHERE Category = "TOTALS"'
dfStationary_Result = psql.sqldf(q_stationary)
dfStationary_Result = pd.DataFrame(dfStationary_Result)
stationary_totalArr = []
for g in stationaryNames:
stationary_totalArr.append(float(dfStationary_Result[g]))
dfStationary_Result = pd.DataFrame({'Year' : stationaryNames,
'Stationary GHG' : stationary_totalArr
})
#print(dfStationary_Result)
#------------------Expenditures------------------------------------------------------
df_exp = pd.read_csv("expenditure_NYPCEPC.csv")
pattern = r'[0-9]{4}'
expNames = []
for i in df_exp['DATE']:
if(re.match(pattern, str(i))):
expNames.append(i[:4])
dfExpenditure = pd.DataFrame({
'Year' : expNames,
'Avg Expenditure' : df_exp['NYPCEPC']
})
q_exp = 'SELECT Year, "Avg Expenditure" FROM dfExpenditure WHERE Year >= 2016 and Year <= 2019'
exp = psql.sqldf(q_exp)
dfExpenditure_Result = pd.DataFrame(exp)
#print(dfExpenditure_Result)
###------------------Uniting into one DataFrame------------------------------------------------------###
df_ALL = pd.DataFrame({
'GasPrices' : dfGas_Result['AvgGasPrices'],
'Waste' : dfWaste_Result['Waste GHG'],
'Transportation' : dfTransportation_Result['Transportation GHG'],
'Stationary' : dfStationary_Result['Stationary GHG'],
'Expenditures' : dfExpenditure_Result['Avg Expenditure']
})
colList = []
for i in df_ALL:
colList.append(i)
###------------------Finding Correlations------------------------------------------------------###
gasCor = []
for j in colList:
if(df_ALL['GasPrices'].astype(float).corr(df_ALL[j] != 1.0)):
abs_temp = (df_ALL['GasPrices'].astype(float).corr(df_ALL[j]).astype(float))
gas_pair = (j, abs_temp)
gasCor.append(gas_pair)
expCor = []
for k in colList:
if(df_ALL['Expenditures'].astype(float).corr(df_ALL[k] != 1.0)):
abs_temp = (df_ALL['Expenditures'].astype(float).corr(df_ALL[k]).astype(float))
exp_pair = (k, abs_temp)
expCor.append(exp_pair)
print("Correlations between Gas Prices and Emissions\n")
for cors in gasCor:
print(cors)
print("\n")
print("Correlations between Expenditures and Emissions\n")
for cors in expCor:
print(cors)