Last active
August 29, 2015 13:56
-
-
Save glebkuznetsov/9063958 to your computer and use it in GitHub Desktop.
WTF is pandas .join() useful for?
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "metadata": { | |
| "name": "" | |
| }, | |
| "nbformat": 3, | |
| "nbformat_minor": 0, | |
| "worksheets": [ | |
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# WTF is `.join()` useful for in pandas?" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "import pandas as pd" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [], | |
| "prompt_number": 1 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "The simplest example is combining two DataFrames of the same length; with no `on` argument, `join()` aligns their rows by index:" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "name_df = pd.DataFrame({'name': ['Alice', 'Bob', 'Carl']})\n", | |
| "age_df = pd.DataFrame({'age': [22, 23, 27]})\n", | |
| "name_age_df = name_df.join(age_df)\n", | |
| "name_age_df" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "html": [ | |
| "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>name</th>\n", | |
| " <th>age</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td> Alice</td>\n", | |
| " <td> 22</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td> Bob</td>\n", | |
| " <td> 23</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td> Carl</td>\n", | |
| " <td> 27</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 2, | |
| "text": [ | |
| " name age\n", | |
| "0 Alice 22\n", | |
| "1 Bob 23\n", | |
| "2 Carl 27" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 2 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
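| "However, `join()` doesn't work as I would expect when both frames share a column. The `on` argument names a column of the calling frame that is matched against the *index* of the other frame, not against its `name` column, so the call below raises an exception (to join on a shared column, use `pd.merge(name_color_df, name_age_df, on='name')` instead):" | |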
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "name_color_df = pd.DataFrame({'name': ['Alice', 'Bob', 'Carl'], 'color': ['blue', 'white', 'green']})\n", | |
| "try:\n", | |
| " name_color_df.join(name_age_df, on='name', how='inner')\n", | |
| "except Exception:\n", | |
| " print 'Incomprehensible exception.'" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "stream": "stdout", | |
| "text": [ | |
| "Incomprehensible exception.\n" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 3 | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "So far, I've found one way it's useful: adding columns to a DataFrame that are the result of a function applied to each row." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "collapsed": false, | |
| "input": [ | |
| "def get_more_cols(row):\n", | |
| " return pd.Series({\n", | |
| " 'age+1': row['age'] + 1,\n", | |
| " 'age+2': row['age'] + 2,\n", | |
| " 'age+3': row['age'] + 3\n", | |
| " })\n", | |
| "name_age_df.join(name_age_df.apply(get_more_cols, axis=1))" | |
| ], | |
| "language": "python", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "html": [ | |
| "<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>name</th>\n", | |
| " <th>age</th>\n", | |
| " <th>age+1</th>\n", | |
| " <th>age+2</th>\n", | |
| " <th>age+3</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td> Alice</td>\n", | |
| " <td> 22</td>\n", | |
| " <td> 23</td>\n", | |
| " <td> 24</td>\n", | |
| " <td> 25</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td> Bob</td>\n", | |
| " <td> 23</td>\n", | |
| " <td> 24</td>\n", | |
| " <td> 25</td>\n", | |
| " <td> 26</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td> Carl</td>\n", | |
| " <td> 27</td>\n", | |
| " <td> 28</td>\n", | |
| " <td> 29</td>\n", | |
| " <td> 30</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "metadata": {}, | |
| "output_type": "pyout", | |
| "prompt_number": 4, | |
| "text": [ | |
| " name age age+1 age+2 age+3\n", | |
| "0 Alice 22 23 24 25\n", | |
| "1 Bob 23 24 25 26\n", | |
| "2 Carl 27 28 29 30" | |
| ] | |
| } | |
| ], | |
| "prompt_number": 4 | |
| } | |
| ], | |
| "metadata": {} | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment