Skip to content

Instantly share code, notes, and snippets.

@roshan-adusumilli
Created May 23, 2020 02:46
Show Gist options
  • Select an option

  • Save roshan-adusumilli/2858a503cb3efe6ec98b99caf3f31b21 to your computer and use it in GitHub Desktop.

Select an option

Save roshan-adusumilli/2858a503cb3efe6ec98b99caf3f31b21 to your computer and use it in GitHub Desktop.
import re
# Literal tokens stripped from the text, applied strictly in this order.
# NOTE(review): these look like residue of escaped byte sequences (e.g. "\x99")
# once the backslashes have been removed -- confirm against the data source.
# Several entries carry a trailing word ("x98People", "x99s", ...): that word is
# removed together with the token. An entry listed *after* its shorter prefix
# (e.g. "x99re" after "x99") normally never matches; it is kept so the removal
# sequence stays the same as before.
_RESIDUE_TOKENS = (
    'x97', 'xa3', 'x98People', 'x98', 'xa0', 'x94and', 'x96', 'x99s',
    'x91', 'x8a', 'xba', 'x9b', 'xbc', 'x92', 'xbf', 'x89https',
    'x94By', 'x8f', 'xb8', 'xa4', 'xa5', 'x87', 'xa5WOW', 'x94',
    'x95', 'xb3', 'x89', 'x9f', 'x9ccoronavirus', 'xbd', 'x9cnatural',
    'x9cmusic', 'xa9', 'x82', 'xc2', 'x83', 'x99all', 'xb1al',
    'x9cessential', 'x9cEveryone', 'x8e', 'x98Reopen', 'xe3', 'xa2',
    'x80', 'x99m', 'x90', 'x9e', 'x99', 'xb9', 'xbb', 'x99re',
    'xa3https', 'x98Burden', 'x9cprogressives', 'xb1d19', 'xaa', 'x86',
    'x8c', 'x93', 'x9d', 'x88', 'x99t', 'xef', 'xf0', 'xa7', 'xb7',
    'x9cThe', 'x9c', 'x99mon', 'x99d', 'xb5', 'xc3', 'xe2', 'x8d',
    'xb0', 'xa6it', 'x98CA', 'xc4', 'xa8', 'x9cthe', 'x99ve', 'x81',
    'x8fTake', 'x85', 'x99S', 'xb8OPEN', 'xa6', 'x8fUplifting',
    'xb8TYRANT', 'xac', 'x99ll', 'x9cfix', 'x98declared', 'xa1',
    'x98fix',
)


def cleanTxt(text):
    """Return *text* with mentions, hashtag marks, RT markers, URLs and
    escape residue removed.

    Steps, in order:

    1. ``@`` followed by ASCII letters/digits is removed.
    2. ``#`` characters are removed (the tag word itself is kept).
    3. A standalone ``RT`` followed by whitespace is removed.
    4. ``http://`` / ``https://`` URLs are removed up to the next whitespace.
    5. Remaining ``/`` and backslash characters are removed.
    6. Every entry of ``_RESIDUE_TOKENS`` is removed, in table order.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Surrounding whitespace is not normalised, so
        removed items can leave consecutive spaces behind.
    """
    # The digit range must use an ASCII hyphen; with an en dash the class only
    # matched the characters '0', '9' and the dash itself, leaving digits
    # 1-8 of a handle in the output.
    text = re.sub(r'@[A-Za-z0-9]+', '', text)
    text = text.replace('#', '')
    # \b keeps words that merely end in "RT" (e.g. "SMART") intact.
    text = re.sub(r'\bRT\s+', '', text)
    text = re.sub(r'https?://\S+', '', text)
    text = text.replace('/', '')
    text = text.replace('\\', '')

    # All tokens are plain alphanumerics, so str.replace is equivalent to the
    # regex substitution and avoids compiling ~100 patterns.
    for token in _RESIDUE_TOKENS:
        text = text.replace(token, '')
    return text
# Clean every tweet of every row, in place, for each of the three frames.
# Iterating the column values (instead of looking rows up by integer label)
# keeps this working when a frame's index is not the default 0..n-1 range.
# NOTE(review): assumes each 'tweet_text' cell is a mutable sequence (e.g. a
# list) of strings, as the original item assignment required -- confirm.
for state_df in (ca_df, ny_df, tx_df):
    for tweets in state_df['tweet_text']:
        for n, tweet in enumerate(tweets):
            tweets[n] = cleanTxt(tweet)

# Kept as the last expression so a notebook cell still displays the preview.
ca_df.head()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment