Created
April 14, 2020 12:10
-
-
Save bd317/ab9000dd6889d50cd324de1d63346c60 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Read the three competition CSV files (train, test, sample submission), in
# that order, into DataFrames.
train_df, test_df, submission = map(
    pd.read_csv,
    (
        "../input/covid19-global-forecasting-week-4/train.csv",
        "../input/covid19-global-forecasting-week-4/test.csv",
        "../input/covid19-global-forecasting-week-4/submission.csv",
    ),
)
# Show the percentage of missing values per column, first for the train set
# and then for the test set.
for frame in (train_df, test_df):
    display(frame.isnull().sum() / len(frame) * 100)
# Recorded output (train set, then test set):
#Id 0.000000
#Province_State 57.507987
#Country_Region 0.000000
#Date 0.000000
#ConfirmedCases 0.000000
#Fatalities 0.000000
#dtype: float64
#ForecastId 0.000000
#Province_State 57.507987
#Country_Region 0.000000
#Date 0.000000
#dtype: float64
# Report the smallest and largest value of the Date column in each data set.
train_dates = train_df['Date']
test_dates = test_df['Date']
print("The lowest date in the train data set is {} and the highest {}.".format(train_dates.min(), train_dates.max()))
print("The lowest date in the test data set is {} and the highest {}.".format(test_dates.min(), test_dates.max()))
# Recorded output:
#The lowest date in the train data set is 2020-01-22 and the highest 2020-04-13.
#The lowest date in the test data set is 2020-04-02 and the highest 2020-05-14.
# Cosmetic only: shorten the two location column names, in place, in both
# data sets.
column_aliases = {'Province_State': 'State', 'Country_Region': 'Country'}
for frame in (train_df, test_df):
    frame.rename(columns=column_aliases, inplace=True)
# Helper for filling a missing state value with the country value.
def missings(state, country):
    """Return ``state``, or ``country`` when ``state`` is missing.

    Args:
        state: Scalar state/province value. It counts as missing when
            ``pd.isna`` reports it as such (for example ``NaN`` or ``None``).
        country: Value returned in place of a missing ``state``.

    Returns:
        ``country`` if ``state`` is missing, otherwise ``state`` unchanged.
    """
    # pd.isna already yields a bool for a scalar, so no "== True" is needed.
    return country if pd.isna(state) else state
# Where a row has no state, use that row's country name as its state.
# Series.fillna with the Country column of the same frame fills each gap from
# the matching row in one vectorized step, instead of one Python call per row,
# and it also works when the frame is empty.
train_df['State'] = train_df['State'].fillna(train_df['Country'])
test_df['State'] = test_df['State'].fillna(test_df['Country'])
print("In our data set are {} countries and {} states.".format(train_df['Country'].nunique(),train_df['State'].nunique()))
# Recorded output:
#In our data set are 184 countries and 312 states.
# Country-level totals, sorted with the highest ConfirmedCases first.
# Step 1: take the maximum of each count per (Country, State) pair.
#         NOTE(review): presumably the counts are cumulative, so the maximum
#         is the latest total -- confirm against the data description.
# Step 2: sum those per-state maxima within each country.
# Only the two count columns are selected before aggregating. Otherwise the
# string Date column (and Id) would be aggregated too, and recent pandas
# versions no longer drop non-numeric columns silently in groupby sums.
df_confirmedcases = (
    train_df.groupby(['Country', 'State'])[['ConfirmedCases', 'Fatalities']]
    .max()
    .groupby('Country')
    .sum()
    .sort_values(by='ConfirmedCases', ascending=False)
    .reset_index()
)
# Top 20 countries as a styled table with an orange colour gradient. This is a
# bare expression, so it renders only as the last expression of a notebook cell.
df_confirmedcases[:20].set_index('Country').style.background_gradient(cmap='Oranges')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment