Muscle Memory for Data Science with Python_01_ZhenLiu Full Script
# Build Muscle Memory for Data Science with Python [Part 1] #
###############################
# 0. Read, View and Save data #
###############################
# First, load the libraries for our exercise
# Load libraries #
import pandas as pd
import numpy as np
# Download the data from Zillow.
file_dir = "https://raw.githubusercontent.com/zhendata/Medium_Posts/master/City_Zhvi_1bedroom_2018_05.csv"
# read csv file into a Pandas dataframe
raw_df = pd.read_csv(file_dir)
# check first 5 rows of the file
raw_df.head(5)
# Save csv:
# use raw_df.tail(5) to see the last 5 rows of the file
# to save a dataframe to csv, use dataframe.to_csv()
# If you don't want the index numbers to be saved, use dataframe.to_csv(index=False)
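# A quick self-contained sketch of the save/read round trip, using an in-memory
# buffer and toy data (hypothetical, not the Zillow file):

```python
import io
import pandas as pd

# Toy stand-in for the Zillow csv (hypothetical data)
df = pd.DataFrame({"RegionName": ["New York", "Chicago"], "2018-01": [100, 200]})
buf = io.StringIO()
df.to_csv(buf, index=False)  # index=False drops the row-number column
buf.seek(0)
round_trip = pd.read_csv(buf)
print(round_trip.columns.tolist())  # ['RegionName', '2018-01'] -- no 'Unnamed: 0'
```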
########################################
# 1. Table's Dimension and Data Types  #
########################################
## 1.1 Dimension ##
raw_df.shape
# the result is a tuple: (# of rows, # of cols)
# Get the number of rows
print(raw_df.shape[0])
# column is raw_df.shape[1]
## 1.2 Data Types ##
# What are the data types of your data, and how many columns are numeric?
# Check the data types of the entire table's columns
raw_df.dtypes
# Check the data type of a specific column
raw_df['RegionID'].dtypes
# result: dtype('int64')
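# A minimal sketch of .shape and .dtypes on a small hypothetical frame:

```python
import pandas as pd

# Hypothetical mini-frame to illustrate .shape and .dtypes
df = pd.DataFrame({"RegionID": [1, 2], "State": ["NY", "IL"], "2018-01": [100.0, 200.0]})
print(df.shape)               # (2, 3): (rows, columns)
print(df.shape[0])            # number of rows
print(df.dtypes)              # one dtype per column
print(df["RegionID"].dtypes)  # int64
```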
################################
# 2. Basic Column Manipulation #
################################
## 2.1 Subset data by columns ##
### a.Select columns by data types: ###
# if you only want to include columns of float data
raw_df.select_dtypes(include=['float64'])
# Or to get numerical columns by excluding objects (non-numeric)
raw_df.select_dtypes(exclude=['object'])
# Get a list of all numerical column names #
num_cols = raw_df.select_dtypes(include=[np.number]).columns.tolist()
# For example: if you only want float and integer columns
raw_df.select_dtypes(include = ['float','int'])
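# The select_dtypes calls above, sketched on a toy frame with mixed dtypes
# (hypothetical data):

```python
import numpy as np
import pandas as pd

# Hypothetical frame: one object column, one int, one float
df = pd.DataFrame({"name": ["a", "b"], "n": [1, 2], "x": [1.5, 2.5]})
floats = df.select_dtypes(include=["float64"])        # just 'x'
numeric = df.select_dtypes(exclude=["object"])        # 'n' and 'x'
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print(num_cols)  # ['n', 'x']
```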
### b.Select and drop columns by names: ###
raw_df_info = raw_df[['RegionID', 'RegionName', 'State', 'Metro', 'CountyName']]
# drop columns by names
raw_df_sub = raw_df_info.drop(['RegionID','RegionName'],axis=1)
raw_df_sub.head(5)
## 2.2 Rename Columns ##
# How do I rename columns I don't like? For example, change 'State' to 'state_' and 'City' to 'city_':
raw_df_renamed1 = raw_df.rename(columns={'State':'state_', 'City':'city_'})
# If you need to change a lot of columns, it's easier to map the old names to the new names
old_names = ['State', 'City']
new_names = ['state_', 'city_']
raw_df_renamed2 = raw_df.rename(columns=dict(zip(old_names, new_names)))
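# The zip-based bulk rename, sketched on a toy frame (hypothetical data):

```python
import pandas as pd

# Hypothetical frame to show the zip-based bulk rename
df = pd.DataFrame({"State": ["NY"], "City": ["New York"]})
old_names = ["State", "City"]
new_names = ["state_", "city_"]
renamed = df.rename(columns=dict(zip(old_names, new_names)))
print(renamed.columns.tolist())  # ['state_', 'city_']
```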
###########################################
# 3. Null Values: View, Delete and Impute #
###########################################
## 3.1 How many rows and columns have null values? ##
# 3.11. For each column, are there any NaN values?
raw_df.isnull().any()
# 3.12. For each column, how many rows are NaN?
raw_df.isnull().sum()
# the results for 1&2 are shown in the screenshot below this block
# 3.13. How many columns have NaNs?
raw_df.isnull().any(axis=0).sum()
# note: raw_df.isnull().sum().count() just counts the total number of columns (271 here),
# with or without NaNs; chaining .any(axis=0).sum() counts only the columns that contain NaNs
# 3.14. Similarly, how many rows have NaNs?
raw_df.isnull().any(axis=1).sum()
# raw_df.isnull().sum(axis=1).count() would likewise just count all 1324 rows
# 3.15 Select data that isn't null in one column, for example, 'Metro' isn't null.
raw_df_metro = raw_df[pd.notnull(raw_df['Metro'])]
# If we want to take a look at what cities have null metros
raw_df[pd.isnull(raw_df['Metro'])].head(5)
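# The null-inspection patterns from 3.1, sketched on a toy frame with some
# missing values (hypothetical data):

```python
import numpy as np
import pandas as pd

# Hypothetical frame with one missing Metro and one missing price
df = pd.DataFrame({"Metro": ["NYC", None, "Chicago"], "2018-01": [1.0, 2.0, np.nan]})
print(df.isnull().any())  # per column: is any value NaN?
print(df.isnull().sum())  # per column: how many rows are NaN?
with_metro = df[pd.notnull(df["Metro"])]  # keep rows where Metro is present
print(len(with_metro))    # 2
```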
## 3.2 Select rows that are not null for a fixed set of columns ##
# Select a subset of the data that has no nulls after 2000
# For example, to select the data for July you would find the columns whose names contain '-07';
# to test whether a string contains a substring, use substring in string, which returns True or False
# Drop NA rows based on a subset of columns: for example, drop the rows if it doesn't have 'State' and 'RegionName' info
df_regions = raw_df.dropna(subset = ['State', 'RegionName'])
# Get the columns with data available after 2000: check for '2000-' in the column names #
# (you could also use <string>.startswith("2000-"))
cols_2000= [x for x in raw_df.columns.tolist() if '2000-' in x]
raw_df.dropna(subset=cols_2000).head(5)
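# dropna(subset=...) only checks the listed columns; a minimal sketch on
# hypothetical data:

```python
import numpy as np
import pandas as pd

# Hypothetical frame: each of the last two rows is missing one key field
df = pd.DataFrame({
    "State": ["NY", None, "IL"],
    "RegionName": ["New York", "X", None],
    "2000-07": [1.0, 2.0, 3.0],
})
cols_2000 = [c for c in df.columns if "2000-" in c]
print(cols_2000)  # ['2000-07']
cleaned = df.dropna(subset=["State", "RegionName"])
print(len(cleaned))  # 1: only the first row has both fields
```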
## 3.3 Subset Rows by Null Values ##
# Select rows with at least 50 non-NA values, without being specific about the columns
# Drop the rows where at least one column is NA.
# Method 1:
raw_df.dropna()
# equivalent to raw_df.dropna(axis='index', how='any')
# Method 2:
raw_df[raw_df.notnull().all(axis=1)]
# Keep only the rows that have at least 50 non-NA values
not_null_50_df = raw_df.dropna(axis='index', thresh=50)
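# thresh keeps a row only if it has at least that many non-NA values; a small
# sketch on hypothetical data:

```python
import numpy as np
import pandas as pd

# Hypothetical frame: the second row has only one non-NA value
df = pd.DataFrame({"a": [1.0, np.nan], "b": [2.0, np.nan], "c": [3.0, 4.0]})
kept = df.dropna(thresh=2)  # keep rows with at least 2 non-NA values
print(len(kept))  # 1: the second row is dropped
```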
## 3.4 Drop and Impute Missing Values ##
# Fill NA or impute NA:
#fill with 0:
raw_df.fillna(0)
#fill with missing:
raw_df['State'].fillna('missing')
#fill with mean or median:
raw_df['2018-01'].fillna((raw_df['2018-01'].mean()),inplace=True)
# inplace=True changes the original dataframe without assigning it to a column or dataframe
# it's the same as raw_df['2018-01']=raw_df['2018-01'].fillna((raw_df['2018-01'].mean()),inplace=False)
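# The common fillna patterns, sketched on a toy frame (hypothetical data):

```python
import numpy as np
import pandas as pd

# Hypothetical frame with one missing State and one missing price
df = pd.DataFrame({"State": ["NY", None], "2018-01": [100.0, np.nan]})
df["State"] = df["State"].fillna("missing")
df["2018-01"] = df["2018-01"].fillna(df["2018-01"].mean())  # mean ignores NaNs
print(df["State"].tolist())    # ['NY', 'missing']
print(df["2018-01"].tolist())  # [100.0, 100.0]
```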
# Use your own condition to fill, with np.where:
# fill values with conditional assignment by using np.where
# syntax: df['column_name'] = np.where(condition, A, B) #
# the value is A if the condition is True, otherwise it's B #
# note: np.where takes no axis argument; in pandas, axis='columns' (or 1) acts
# across the columns within each row, and axis='index' (or 0) acts down the rows
raw_df['2018-02'] = np.where(raw_df['2018-02'].notnull(), raw_df['2018-02'], raw_df['2017-02'].mean())
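# np.where picks element-wise between two sources; a minimal sketch on toy data
# (hypothetical, not the Zillow columns):

```python
import numpy as np
import pandas as pd

# Hypothetical frame: one missing 2018-02 value to backfill
df = pd.DataFrame({"2018-02": [np.nan, 5.0], "2017-02": [2.0, 4.0]})
# keep 2018-02 where present, otherwise use the mean of 2017-02 (which is 3.0)
df["2018-02"] = np.where(df["2018-02"].notnull(), df["2018-02"], df["2017-02"].mean())
print(df["2018-02"].tolist())  # [3.0, 5.0]
```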
#########################
# 4. Data Deduplication #
#########################
# Check duplicates #
raw_df.duplicated()
# outputs a True/False value for each row
raw_df.duplicated().sum()
# for raw_df it's 0, meaning there's no duplication
# Check if there's any duplicated values by column, output is True/False for each row
raw_df.duplicated('RegionName')
# Select the duplicated rows to see what they look like
# keep = False marks all duplicated values as True so it only leaves the duplicated rows
raw_df[raw_df['RegionName'].duplicated(keep=False)].sort_values('RegionName')
# Drop duplicated rows #
# syntax: df.drop_duplicates(subset =[list of columns], keep = 'first', 'last', False)
unique_df = raw_df.drop_duplicates(subset = ['CountyName','SizeRank'], keep='first')
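# duplicated(keep=False) and drop_duplicates, sketched on a toy frame with a
# repeated RegionName (hypothetical data):

```python
import pandas as pd

# Hypothetical frame: 'Springfield' appears in two states
df = pd.DataFrame({"RegionName": ["Springfield", "Springfield", "Aurora"],
                   "State": ["IL", "MO", "IL"]})
dupes = df[df["RegionName"].duplicated(keep=False)]  # keep=False flags every copy
unique_df = df.drop_duplicates(subset=["RegionName"], keep="first")
print(len(dupes), len(unique_df))  # 2 2
```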