Skip to content

Instantly share code, notes, and snippets.

@0x7067
Created November 19, 2016 16:54
Show Gist options
  • Select an option

  • Save 0x7067/7c95df3ef3c313c4af3011a81dd38163 to your computer and use it in GitHub Desktop.

Select an option

Save 0x7067/7c95df3ef3c313c4af3011a81dd38163 to your computer and use it in GitHub Desktop.
[u'psic\xf3loga', u'hoje', u'deu', u'vontade', u'ler', u'romancezinho', u'bem', u'self', u'insert', u'esquecer', u'existo']
[]
[u'whoa', u'must', u'biggest', u'windows', u'blue', u'screen', u'death', u'ever', u'seen']
[u'whenever', u'see', u'someone', u'write', u'would']
[u'equipe', u'rocket', u'destruindo', u'padr\xf5es', u'g\xeanero', u'desde']
[u'want', u'relationship', u'even', u'though', u'women', u'shepard', u'miga', u'mina', u'\xe9', u'_azul_', u'talvez', u'atente', u'pra', u'antes', u'g\xeanero']
[u'adoro', u'\xe9', u'fucking', u'wrpg', u'ainda', u'assim', u'momento', u'b-but', u'girls', u'liara']
[]
[u'mass', u'effect']
[u'cena', u'ser', u'maravilhosa', u'nunca', u'mds', u'<3']
PS: Run filter_tweets.py with "python filter_tweets.py >> after.txt"
psicóloga hoje me deu vontade de ler um romancezinho bem self insert e esquecer que eu existo.
rt AT_USER URL
rt AT_USER whoa! "this must be the biggest windows blue screen of death ever seen": URL
rt AT_USER me whenever i see someone write "should of" or "would of" URL
rt AT_USER equipe rocket destruindo padrões de gênero desde 1997 URL
you want a relationship with me? even though we're both women?" shepard, miga, a mina é _azul_. talvez atente pra isso antes do gênero
adoro que é um fucking wrpg e ainda assim tem o momento "b-but we're both girls!" com a liara
AT_USER s
AT_USER mass effect
essa cena não para de ser maravilhosa nunca mds &lt;3
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
import re
import string
from collections import Counter
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
def normalize_contractions(tokens):
    """Expand common English contractions (plus two Portuguese texting
    shorthands) into their full multi-word forms.

    Tokens with no known expansion pass through unchanged.

    Args:
        tokens: iterable of lowercase token strings.

    Yields:
        str: each input token, except that a mapped token yields every
        word of its expansion in order (e.g. "i'm" -> "i", "am").
    """
    token_map = {
        "i'm": "i am",
        "you're": "you are",
        "it's": "it is",
        "we're": "we are",
        "we'll": "we will",
        "vc": "voce",      # Portuguese shorthand for "você"
        "pq": "porque",    # Portuguese shorthand for "porque"
    }
    for tok in tokens:
        # Single dict lookup via .get() instead of the original
        # `tok in token_map.keys()` membership test followed by a
        # second lookup on a hit.
        expansion = token_map.get(tok)
        if expansion is None:
            yield tok
        else:
            for word in expansion.split():
                yield word
def process(text, tokenizer=None, stopwords=None):
    """Normalize the text of a tweet into a filtered token list.

    Steps:
      - lowercase the raw text
      - tokenize (TweetTokenizer)
      - expand contractions via normalize_contractions()
      - drop stopwords and purely-numeric tokens

    Args:
        text: raw tweet text (str).
        tokenizer: object with a ``tokenize(str)`` method.  Defaults to
            ``TweetTokenizer(strip_handles=True, reduce_len=True)``.
            NOTE: the original body silently discarded this argument by
            rebuilding a tokenizer inside the function; it is now honored.
        stopwords: collection of tokens to remove.  ``None`` (the
            default) means no filtering — this avoids the original
            mutable-default-argument ``stopwords=[]`` pitfall.

    Returns:
        list of str: the surviving tokens.
    """
    if tokenizer is None:
        tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
    if stopwords is None:
        stopwords = ()
    tokens = normalize_contractions(tokenizer.tokenize(text.lower()))
    return [tok for tok in tokens if tok not in stopwords and not tok.isdigit()]
# --- Script entry: tokenize a pre-processed tweet dump and print each
# tweet's filtered token list, one list per line. ---
tknzr = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
punct = list(string.punctuation)
# Corpus-specific noise: the collection's city tokens plus the
# AT_USER/URL placeholder markers inserted during pre-processing.
custom_list = ['belo', 'horizonte', 'mg', 'at_user', 'url']
stopword_list = (stopwords.words('portuguese') + stopwords.words('english')
                 + punct + ['rt', 'via'] + custom_list)
# `with` guarantees the handle is closed (the original opened the file
# and never closed it), and iterating the file object replaces the
# manual readline()/while loop.
with open('tweets_pre_processed.txt', 'r') as infile:
    for line in infile:
        print(process(text=line, tokenizer=tknzr, stopwords=stopword_list))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment