Skip to content

Instantly share code, notes, and snippets.

@jikanter
Last active January 15, 2025 19:01
Show Gist options
  • Select an option

  • Save jikanter/7ee1f29cec5332939f1a48e9f1bba479 to your computer and use it in GitHub Desktop.

Select an option

Save jikanter/7ee1f29cec5332939f1a48e9f1bba479 to your computer and use it in GitHub Desktop.
[dbt snippets] #ai #aigenerated #chatgpt #
# List of Highly Used Pandas Data Transforms
# 1. Loading Data
import pandas as pd
data = pd.read_csv('data.csv')
# 2. Basic Inspection
data.info()
data.describe()
data.head()
data.tail()
data.columns
# 3. Handling Missing Data
# Drop missing values
data.dropna(inplace=True)
# Fill missing values
data.fillna({'column_name': 0}, inplace=True)
# Check for missing values
data.isnull().sum()
# 4. Filtering Rows
# Filter rows by condition
filtered_data = data[data['column_name'] > 10]
# Filter rows with multiple conditions
filtered_data = data[(data['column1'] > 10) & (data['column2'] == 'value')]
# 5. Transforming Columns
# Apply function to a column
data['new_column'] = data['existing_column'].apply(lambda x: x * 2)
# Vectorized operations
data['new_column'] = data['existing_column'] * 2
# Renaming columns
data.rename(columns={'old_name': 'new_name'}, inplace=True)
# 6. Aggregations and Grouping
# Group by and aggregate
grouped_data = data.groupby('group_column').agg({'value_column': 'sum'})
# Reset index after grouping
grouped_data.reset_index(inplace=True)
# 7. Sorting Data
# Sort by column values
data.sort_values('column_name', ascending=True, inplace=True)
# Sort by multiple columns
data.sort_values(['column1', 'column2'], ascending=[True, False], inplace=True)
# 8. Merging and Joining
# Merge two dataframes on a key
merged_data = pd.merge(data1, data2, on='key_column', how='inner')
# Concatenate dataframes vertically or horizontally
concatenated_data = pd.concat([data1, data2], axis=0) # axis=1 for horizontal
# 9. Pivoting and Reshaping
# Pivot table
pivot_table = data.pivot_table(values='value_column', index='index_column', columns='columns_column', aggfunc='mean')
# Melt data
melted_data = pd.melt(data, id_vars=['id_column'], value_vars=['value_column'])
# 10. Working with Dates
# Convert column to datetime
data['date_column'] = pd.to_datetime(data['date_column'])
# Extract components of datetime
data['year'] = data['date_column'].dt.year
data['month'] = data['date_column'].dt.month
# 11. Exporting Data
# Export to CSV
data.to_csv('output.csv', index=False)
# Export to Excel
data.to_excel('output.xlsx', index=False)
# 12. Advanced Indexing
# Set index
data.set_index('index_column', inplace=True)
# Reset index
data.reset_index(inplace=True)
# Selecting by index
data.loc['row_label']
data.iloc[0]
# 13. Handling Duplicates
# Drop duplicates
data.drop_duplicates(subset=['column_name'], inplace=True)
# Check for duplicates
data.duplicated(subset=['column_name']).sum()
# 14. Data Type Conversions
# Convert column type
data['column_name'] = data['column_name'].astype('int')
# Convert multiple columns
data = data.astype({'col1': 'float', 'col2': 'int'})
# 15. Basic Text Manipulations
# Append text to a column
data['new_column'] = data['existing_column'] + '_suffix'
# Prepend text to a column
data['new_column'] = 'prefix_' + data['existing_column']
# Replace text in a column
data['new_column'] = data['existing_column'].str.replace('old_value', 'new_value', regex=True)
# Convert to lowercase
data['existing_column'] = data['existing_column'].str.lower()
# Convert to uppercase
data['existing_column'] = data['existing_column'].str.upper()
# Strip whitespace from a column
data['existing_column'] = data['existing_column'].str.strip()
# Extract substring from a column
data['substring'] = data['existing_column'].str[0:5]
# Check if a string contains a substring
data['contains_value'] = data['existing_column'].str.contains('value', na=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment