import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
# Credit data set: a data frame with 10000 observations on the following 12 variables.
#   ID         Identification
#   Income     Income in $10,000's
#   Limit      Credit limit
#   Rating     Credit rating
#   Cards      Number of credit cards
#   Age        Age in years
#   Education  Number of years of education
#   Gender     A factor with levels Male and Female
#   Student    A factor with levels No and Yes indicating whether the individual was a student
#   Married    A factor with levels No and Yes indicating whether the individual was married
#   Ethnicity  A factor with levels African American, Asian, and Caucasian indicating the individual's ethnicity
#   Balance    Average credit card balance in $
# --- Credit data: PCA on the standardized numeric columns --------------------
# Read the comma-separated credit file and preview its first ten rows.
df_credit = pd.read_table('Data/credit.csv', sep=',')
df_credit.head(10)

# Select the analysis columns by position (1 through 6, plus 11).
X_credit = df_credit.iloc[:, list(range(1, 7)) + [11]]
print(X_credit.shape)

# Standardize every selected column; fit() returns the fitted scaler itself.
scaler = StandardScaler().fit(X_credit.astype('float'))
X_credit_scaled = scaler.transform(X_credit.astype('float'))
pd.DataFrame(X_credit_scaled).describe()

# Fit a 7-component PCA and keep the projected observations (scores).
pca_credit = PCA(n_components=7)
Z_credit = pca_credit.fit_transform(X_credit_scaled)

# Loadings rounded to 2 decimals: one row per original variable,
# one column per principal component.
pc_credit = pca_credit.components_.round(2)
pc_credit_df = pd.DataFrame(pc_credit, columns=X_credit.columns).T
pc_credit_df

# Heatmap of the loadings.
plt.figure(figsize=(8, 6))
sns.heatmap(pc_credit_df, cmap='RdBu')
plt.show()

# Share of total variance explained by each component.
print(np.round(pca_credit.explained_variance_ratio_, 2))
# College data set: statistics for a large number of US colleges from the 1995 issue of
# US News and World Report. Contains 777 observations on the following 18 variables.
#   Private      A factor with levels No and Yes indicating private or public university
#   Apps         Number of applications received
#   Accept       Number of applications accepted
#   Enroll       Number of new students enrolled
#   Top10perc    Pct. of new students from top 10% of H.S. class
#   Top25perc    Pct. of new students from top 25% of H.S. class
#   F.Undergrad  Number of full-time undergraduates
#   P.Undergrad  Number of part-time undergraduates
#   Outstate     Out-of-state tuition
#   Room.Board   Room and board costs
#   Books        Estimated book costs
#   Personal     Estimated personal spending
#   PhD          Pct. of faculty with Ph.D.'s
#   Terminal     Pct. of faculty with terminal degree
#   S.F.Ratio    Student/faculty ratio
#   perc.alumni  Pct. of alumni who donate
#   Expend       Instructional expenditure per student
#   Grad.Rate    Graduation rate
# --- College data: PCA on the standardized columns ---------------------------
# Read the comma-separated college file and preview its first ten rows.
# NOTE(review): the label lookups below assume the frame ends up indexed by
# college name after loading -- confirm against the data file.
df_colleges = pd.read_table(filepath_or_buffer='Data/college.csv', sep=',')
df_colleges.head(n=10)

# Drop the first column by position and keep every remaining column
# (presumably this removes the categorical `Private` factor -- verify).
X_colleges = df_colleges.iloc[:, 1:]
print(X_colleges.shape)

# Standardize every column; the float cast is done once and the scaler is
# fitted and applied in a single step.
scaler = StandardScaler()
X_colleges_scaled = scaler.fit_transform(X_colleges.astype('float'))
pd.DataFrame(X_colleges_scaled).describe()

# Fit a 17-component PCA and keep the projected observations (scores).
pca_colleges = PCA(n_components=17)
Z_colleges = pca_colleges.fit_transform(X_colleges_scaled)

# Loadings rounded to 2 decimals: one row per original variable,
# one column per principal component.
pc_colleges = np.round(pca_colleges.components_, 2)
pc_colleges_df = pd.DataFrame(pc_colleges, columns=X_colleges.columns).transpose()
pc_colleges_df

# Heatmap of the loadings.
plt.figure(figsize=[10, 8])
sns.heatmap(pc_colleges_df, cmap='RdBu')
plt.show()

# Share of total variance explained by each component.
print(np.round(pca_colleges.explained_variance_ratio_, 2))

# Show the raw record for one college, then its principal-component scores.
# The score row is looked up from the index label rather than a hard-coded
# position, so it stays correct if the row order of the file changes.
print(df_colleges.loc['Harvard University', :])
harvard_row = df_colleges.index.get_loc('Harvard University')
print(Z_colleges[harvard_row, :])
# Places data set: the data are taken from the Places Rated Almanac, by Richard Boyer and
# David Savageau, copyrighted and published by Rand McNally. The nine rating criteria
# used by the Places Rated Almanac are:
#   - Climate & Terrain
#   - Housing
#   - Health Care & Environment
#   - Crime
#   - Transportation
#   - Education
#   - The Arts
#   - Recreation
#   - Economics
# For all but two of the above criteria, the higher the score, the better.
# For Housing and Crime, the lower the score, the better.
# --- Places data: PCA on the standardized rating criteria --------------------
# Read the tab-separated places file, index the rows by the `City` column,
# and clear the index name.
df_places = pd.read_table(filepath_or_buffer='Data/places.txt', sep='\t')
df_places.set_index('City', inplace=True)
df_places.index.name = None
df_places.head(n=10)

# Keep the first nine columns by position.
X_places = df_places.iloc[:, [0, 1, 2, 3, 4, 5, 6, 7, 8]]
print(X_places.shape)

# Standardize every column; the float cast is done once and the scaler is
# fitted and applied in a single step.
scaler = StandardScaler()
X_places_scaled = scaler.fit_transform(X_places.astype('float'))
pd.DataFrame(X_places_scaled).describe()

# Fit a 9-component PCA and keep the projected observations (scores).
pca_places = PCA(n_components=9)
Z_places = pca_places.fit_transform(X_places_scaled)

# Loadings rounded to 3 decimals: one row per original variable,
# one column per principal component.
pc_places = np.round(pca_places.components_, 3)
pc_places_df = pd.DataFrame(pc_places, columns=X_places.columns).transpose()
pc_places_df

# Heatmap of the loadings.
plt.figure(figsize=[8, 6])
sns.heatmap(pc_places_df, cmap='RdBu')
plt.show()

# Share of total variance explained by each component.
print(np.round(pca_places.explained_variance_ratio_, 2))

# Principal-component scores for one city. The score row is looked up from
# the index label rather than a hard-coded position, so it stays correct if
# the row order of the file changes.
st_louis_row = df_places.index.get_loc('St.-Louis,MO-IL')
print(Z_places[st_louis_row, :])