georgedevasia · July 6, 2018 09:04
diff --git a/Pandas_cheatsheet.py b/Pandas_cheatsheet.py
 import os

 import pandas as pd

 # Silence SettingWithCopyWarning ("A value is trying to be set on a copy of
 # a slice from a DataFrame"). Use either of the two options below.
 pd.options.mode.chained_assignment = None  # option 1: global switch (default='warn')
 df.is_copy = False  # option 2: per-frame flag; NOTE(review): `df` is first assigned by the read_csv snippet below, so run this only once a frame exists


 # Read a big csv: gzip-compressed, tab-separated, '#' as the comment
 # marker, pulled in chunks of 1000 rows. With chunksize/iterator set the
 # call returns an iterable of chunks, which concat stitches into a single
 # frame with a fresh index.
 df = pd.read_csv(FILE_PATH, sep='\t', comment = '#', chunksize=1000, \
                low_memory=False, iterator = True, compression='gzip')
 df = pd.concat(list(df), ignore_index=True)

 # Select rows whose value is in a list.
 # NOTE(review): [LIST_PATTERN] looks like a placeholder for the list of
 # accepted values; if LIST_PATTERN is already a list, pass it unwrapped.
 df = df.loc[df['COLUMN_NAME'].isin([LIST_PATTERN])]
 # Same test on two columns, combined element-wise with &.
 df = df[df['COL1'].isin([LIST_PATTERN]) & df['COL2'].isin([LIST_PATTERN])]

 # Select rows where the column equals a given string.
 df = df.loc[df['COL_NAME'] == 'STRING']

 # Select rows where the column is not null.
 df = df[df['COL_NAME'].notnull()]

 # Select rows whose text contains one of several strings ('|' separates
 # the alternatives, assuming the default regex matching; confirm).
 df = df[df['COL_NAME'].str.contains('STRING|STRING', na=False)]

 # Merge by concatenation: stack two frames vertically and renumber the index.
 df_merged = pd.concat([df_1, df_2], ignore_index=True)

 # Working with text in a column: keep only the part before the first '.'.
 df['COL_NAME'] = df.COL_NAME.str.split('.', expand=True)[0]

 # fillna: replace missing values with '-' in place.
 df.fillna(value='-', inplace = True)

 # Drop the listed columns (axis=1) in place.
 df.drop(['COL_NAMES'], axis=1, inplace=True)

 # Sort within each group (descending), then discard the group index.
 # NOTE(review): the sort key uses the same placeholder as the group key;
 # presumably a different column is meant for the within-group ordering.
 df = df.groupby(['COL_NAME']).apply(lambda x: x.sort_values(['COL_NAME'], ascending = False)).reset_index(drop=True)

 # Rename columns via a {old: new} mapping, in place.
 df.rename(columns={'FROM_COL':'TO_COL'}, inplace=True)

 # Write a tab-separated file without the index column.
 # NOTE(review): 'FILE_PATH' is a quoted literal here, unlike the FILE_PATH
 # name passed to read_csv above; confirm which is intended.
 df.to_csv('FILE_PATH', sep='\t', index=False)

 # Insert column `col_name` holding `value` at position `idx`.
 df.insert(idx, col_name, value)

 # create col based on condition 

 def f(row):
     '''
     Derive the new column value for a single row.

     row -> mapping-like row (as handed over by df.apply(f, axis=1)) that
            provides the 'COL_1' and 'COL2' entries
     Returns 'SOME_VAL' when COL_1 is '-' and COL2 is 0, otherwise None.
     '''
     # Guard clause: a row that does not match gets an explicit None
     # instead of reading a local that was never assigned
     # (UnboundLocalError in the previous version).
     if row['COL_1'] == '-' and row['COL2'] == 0:
         return 'SOME_VAL'
     return None

 # Build COL_3 by applying the row-wise helper f to every row (axis=1).
 df['COL_3'] = df.apply(f, axis=1)

 # read excel
 # `sheet_name` is the supported keyword; the old `sheetname` spelling is
 # rejected by current pandas.
 df = pd.read_excel(EXCEL, sheet_name='SHEET1', skiprows=2, header=1)

 # concat lists of df based on cols
 # Place the frames in `dfs` side by side, then keep only the first
 # occurrence of every repeated column label.
 df = pd.concat(dfs, axis=1, names=[LIST_COLS])
 df = df.loc[:, ~df.columns.duplicated()]

 # replace
 # Assign the result back: an in-place replace on the df['COL'] selection
 # may act on a temporary copy and leave df itself unchanged.
 df['COL'] = df['COL'].replace('FROM', 'TO')

 # drop dups (rows repeating a 'COL' value), in place
 df.drop_duplicates(subset=['COL'], inplace=True)

 # sort by 'COL', in place
 df.sort_values(by=['COL'], inplace=True)

 # from dict to df: one row per (key, value) pair of dict `d`
 df = pd.DataFrame(list(d.items()), columns=['COL1', 'COL2'])

 # groupby and join in list
 # Collapse each 'COL' group by joining its values with ', '
 # (assumes the remaining columns hold strings; confirm).
 df = df.groupby('COL', as_index=False).aggregate(lambda x: ', '.join(x))

 # add suffix to every column label
 df = df.add_suffix('_some_suffix')

 # split and stack list in a column
 # Expand each entry of 'COL' into its own columns, stack them into one
 # long frame, and join it back on the original row index (level_0).
 # Assumes 'COL' holds list-like values; confirm.
 s = df['COL'].apply(pd.Series).stack().reset_index()
 s.index = s.level_0
 del s['level_0']
 del s['level_1']
 df = df.join(s)


 # plot histograms
 df.hist(column='COLS', bins=50)

 # astype
 # errors='ignore' asks astype not to raise when the cast to int fails.
 df['COL_3'] = df['COL_3'].astype(int, errors='ignore')

 # apply
 # Row-wise (axis=1) concatenation of COL1 (as text) and COL2; the result
 # is not stored here.
 df.apply(lambda x: str(x['COL1']) + x['COL2'], axis=1)

 # split col in 2
 # Split on the first ': ' only. NOTE(review): every value must contain
 # ': ', otherwise the two-way unpacking fails.
 df['new_col1'], df['new_col2'] = zip(*df['original_col'].apply(lambda x: x.split(': ', 1)))

 # OrderedDict
 from collections import OrderedDict
 oderded_dir = OrderedDict(zip([LIST1], [LIST2]))

 # file exists
 os.path.exists(FILE_PATH)

 # writing excel 
 def set_format(df, worksheet1):
     '''
     Set each column's width in an excel sheet based on len(column).

     df         -> DataFrame whose columns were written to the sheet
     worksheet1 -> worksheet object providing
                   set_column(first_col, last_col, width)

     The width is the longer of the header and the widest cell (as text)
     plus 2 characters of padding. When that exceeds 20, the column falls
     back to the header length + 2.
     '''
     for i, col in enumerate(df.columns):
         # str() so non-string labels (e.g. integer column names) work too.
         header_len = len(str(col))
         cells = df[col].astype(str)
         # A column without rows has no widest cell; .max() would give NaN
         # there and propagate into the width.
         widest_cell = int(cells.str.len().max()) if len(cells) else 0
         column_len = max(widest_cell, header_len) + 2
         if column_len > 20:
             column_len = header_len + 2
         worksheet1.set_column(i, i, column_len)
 # Write the report only when FILE_NAME does not exist yet, so an existing
 # report is never overwritten.
 if not os.path.exists(FILE_NAME):
     # The context manager saves and closes the workbook on exit, also when
     # an error occurs; it replaces the explicit writer.save() call, which
     # current pandas no longer provides.
     with pd.ExcelWriter(FILE_NAME, engine='xlsxwriter') as writer:
         # startrow=2 leaves the first two sheet rows free above the table.
         df.to_excel(writer, sheet_name=SHEET_NAME, index=False, startrow=2)  # , float_format="%.2g")
         workbook = writer.book  # kept available for further formatting
         worksheet1 = writer.sheets[SHEET_NAME]
         worksheet1.set_zoom(110)
         set_format(df, worksheet1)
     print('Completed writing report !!!')
 else:
     print('Report exists !!!')
    
 #Conditional format excel:  http://xlsxwriter.readthedocs.io/working_with_conditional_formats.html
	import pandas as pd

	# Silence SettingWithCopyWarning ("A value is trying to be set on a copy of
	# a slice from a DataFrame"). Use either of the two options below.
	pd.options.mode.chained_assignment = None  # option 1: global switch (default='warn')
	df.is_copy = False  # option 2: per-frame flag; NOTE(review): `df` is first assigned by the read_csv snippet below, so run this only once a frame exists


	# Read a big csv: gzip-compressed, tab-separated, '#' as the comment
	# marker, pulled in chunks of 1000 rows. With chunksize/iterator set the
	# call returns an iterable of chunks, which concat stitches into a single
	# frame with a fresh index.
	df = pd.read_csv(FILE_PATH, sep='\t', comment = '#', chunksize=1000, \
	               low_memory=False, iterator = True, compression='gzip')
	df = pd.concat(list(df), ignore_index=True)

	# Select rows whose value is in a list.
	# NOTE(review): [LIST_PATTERN] looks like a placeholder for the list of
	# accepted values; if LIST_PATTERN is already a list, pass it unwrapped.
	df = df.loc[df['COLUMN_NAME'].isin([LIST_PATTERN])]
	# Same test on two columns, combined element-wise with &.
	df = df[df['COL1'].isin([LIST_PATTERN]) & df['COL2'].isin([LIST_PATTERN])]

	# Select rows where the column equals a given string.
	df = df.loc[df['COL_NAME'] == 'STRING']

	# Select rows where the column is not null.
	df = df[df['COL_NAME'].notnull()]

	# Select rows whose text contains one of several strings. The separator
	# is a plain '|' so the pattern is an alternation; the escaped form '\|'
	# is an invalid string escape and would match a literal pipe character.
	df = df[df['COL_NAME'].str.contains('STRING|STRING', na=False)]

	# Merge by concatenation: stack two frames vertically and renumber the index.
	df_merged = pd.concat([df_1, df_2], ignore_index=True)

	# Working with text in a column: keep only the part before the first '.'.
	df['COL_NAME'] = df.COL_NAME.str.split('.', expand=True)[0]

	# fillna: replace missing values with '-' in place.
	df.fillna(value='-', inplace = True)

	# Drop the listed columns (axis=1) in place.
	df.drop(['COL_NAMES'], axis=1, inplace=True)

	# Sort within each group (descending), then discard the group index.
	# NOTE(review): the sort key uses the same placeholder as the group key;
	# presumably a different column is meant for the within-group ordering.
	df = df.groupby(['COL_NAME']).apply(lambda x: x.sort_values(['COL_NAME'], ascending = False)).reset_index(drop=True)

	# Rename columns via a {old: new} mapping, in place.
	df.rename(columns={'FROM_COL':'TO_COL'}, inplace=True)

	# Write a tab-separated file without the index column.
	# NOTE(review): 'FILE_PATH' is a quoted literal here, unlike the FILE_PATH
	# name passed to read_csv above; confirm which is intended.
	df.to_csv('FILE_PATH', sep='\t', index=False)

	# Insert column `col_name` holding `value` at position `idx`.
	df.insert(idx, col_name, value)

	# create col based on condition

	def f(row):
	    '''
	    Derive the new column value for a single row.

	    row -> mapping-like row (as handed over by df.apply(f, axis=1)) that
	           provides the 'COL_1' and 'COL2' entries
	    Returns 'SOME_VAL' when COL_1 is '-' and COL2 is 0, otherwise None.
	    '''
	    # Guard clause: a row that does not match gets an explicit None
	    # instead of reading a local that was never assigned
	    # (UnboundLocalError in the previous version).
	    if row['COL_1'] == '-' and row['COL2'] == 0:
	        return 'SOME_VAL'
	    return None

	# Build COL_3 by applying the row-wise helper f to every row (axis=1).
	df['COL_3'] = df.apply(f, axis=1)

	# read excel
	# `sheet_name` is the supported keyword; the old `sheetname` spelling is
	# rejected by current pandas.
	df = pd.read_excel(EXCEL, sheet_name='SHEET1', skiprows=2, header=1)

	# concat lists of df based on cols
	# Place the frames in `dfs` side by side, then keep only the first
	# occurrence of every repeated column label.
	df = pd.concat(dfs, axis=1, names=[LIST_COLS])
	df = df.loc[:, ~df.columns.duplicated()]

	# replace
	# Assign the result back: an in-place replace on the df['COL'] selection
	# may act on a temporary copy and leave df itself unchanged.
	df['COL'] = df['COL'].replace('FROM', 'TO')

	# drop dups (rows repeating a 'COL' value), in place
	df.drop_duplicates(subset=['COL'], inplace=True)

	# sort by 'COL', in place
	df.sort_values(by=['COL'], inplace=True)

	# from dict to df: one row per (key, value) pair of dict `d`
	df = pd.DataFrame(list(d.items()), columns=['COL1', 'COL2'])

	# groupby and join in list
	# Collapse each 'COL' group by joining its values with ', '
	# (assumes the remaining columns hold strings; confirm).
	df = df.groupby('COL', as_index=False).aggregate(lambda x: ', '.join(x))

	# add suffix to every column label
	df = df.add_suffix('_some_suffix')

	# split and stack list in a column
	# Expand each entry of 'COL' into its own columns, stack them into one
	# long frame, and join it back on the original row index (level_0).
	# Assumes 'COL' holds list-like values; confirm.
	s = df['COL'].apply(pd.Series).stack().reset_index()
	s.index = s.level_0
	del s['level_0']
	del s['level_1']
	df = df.join(s)


	# plot histograms
	df.hist(column='COLS', bins=50)

	# astype
	# errors='ignore' asks astype not to raise when the cast to int fails.
	df['COL_3'] = df['COL_3'].astype(int, errors='ignore')

	# apply
	# Row-wise (axis=1) concatenation of COL1 (as text) and COL2; the result
	# is not stored here.
	df.apply(lambda x: str(x['COL1']) + x['COL2'], axis=1)

	# split col in 2
	# Split on the first ': ' only. NOTE(review): every value must contain
	# ': ', otherwise the two-way unpacking fails.
	df['new_col1'], df['new_col2'] = zip(*df['original_col'].apply(lambda x: x.split(': ', 1)))

	# OrderedDict
	from collections import OrderedDict
	oderded_dir = OrderedDict(zip([LIST1], [LIST2]))

	# file exists
	os.path.exists(FILE_PATH)

	# writing excel
	def set_format(df, worksheet1):
	    '''
	    Set each column's width in an excel sheet based on len(column).

	    df         -> DataFrame whose columns were written to the sheet
	    worksheet1 -> worksheet object providing
	                  set_column(first_col, last_col, width)

	    The width is the longer of the header and the widest cell (as text)
	    plus 2 characters of padding. When that exceeds 20, the column falls
	    back to the header length + 2.
	    '''
	    for i, col in enumerate(df.columns):
	        # str() so non-string labels (e.g. integer column names) work too.
	        header_len = len(str(col))
	        cells = df[col].astype(str)
	        # A column without rows has no widest cell; .max() would give NaN
	        # there and propagate into the width.
	        widest_cell = int(cells.str.len().max()) if len(cells) else 0
	        column_len = max(widest_cell, header_len) + 2
	        if column_len > 20:
	            column_len = header_len + 2
	        worksheet1.set_column(i, i, column_len)
	# Write the report only when FILE_NAME does not exist yet, so an existing
	# report is never overwritten.
	if not os.path.exists(FILE_NAME):
	    # The context manager saves and closes the workbook on exit, also when
	    # an error occurs; it replaces the explicit writer.save() call, which
	    # current pandas no longer provides.
	    with pd.ExcelWriter(FILE_NAME, engine='xlsxwriter') as writer:
	        # startrow=2 leaves the first two sheet rows free above the table.
	        df.to_excel(writer, sheet_name=SHEET_NAME, index=False, startrow=2)  # , float_format="%.2g")
	        workbook = writer.book  # kept available for further formatting
	        worksheet1 = writer.sheets[SHEET_NAME]
	        worksheet1.set_zoom(110)
	        set_format(df, worksheet1)
	    print('Completed writing report !!!')
	else:
	    print('Report exists !!!')

	#Conditional format excel: http://xlsxwriter.readthedocs.io/working_with_conditional_formats.html
No results found