0x7067 · November 19, 2016 16:54
diff --git a/after.txt b/after.txt
 [u'psic\xf3loga', u'hoje', u'deu', u'vontade', u'ler', u'romancezinho', u'bem', u'self', u'insert', u'esquecer', u'existo']
 []
 [u'whoa', u'must', u'biggest', u'windows', u'blue', u'screen', u'death', u'ever', u'seen']
 [u'whenever', u'see', u'someone', u'write', u'would']
 [u'equipe', u'rocket', u'destruindo', u'padr\xf5es', u'g\xeanero', u'desde']
 [u'want', u'relationship', u'even', u'though', u'women', u'shepard', u'miga', u'mina', u'\xe9', u'_azul_', u'talvez', u'atente', u'pra', u'antes', u'g\xeanero']
 [u'adoro', u'\xe9', u'fucking', u'wrpg', u'ainda', u'assim', u'momento', u'b-but', u'girls', u'liara']
 []
 [u'mass', u'effect']
 [u'cena', u'ser', u'maravilhosa', u'nunca', u'mds', u'<3']


 PS: Run filter_tweets.py with "python filter_tweets.py >> after.txt"
diff --git a/before.txt b/before.txt
 psicóloga hoje me deu vontade de ler um romancezinho bem self insert e esquecer que eu existo. 
 rt AT_USER URL 
 rt AT_USER whoa! "this must be the biggest windows blue screen of death ever seen": URL 
 rt AT_USER me whenever i see someone write "should of" or "would of" URL 
 rt AT_USER equipe rocket destruindo padrões de gênero desde 1997 URL 
 you want a relationship with me? even though we're both women?" shepard, miga, a mina é _azul_. talvez atente pra isso antes do gênero 
 adoro que é um fucking wrpg e ainda assim tem o momento "b-but we're both girls!" com a liara 
 AT_USER s 
 AT_USER mass effect 
 essa cena não para de ser maravilhosa nunca mds &lt;3 
diff --git a/filter_tweets.py b/filter_tweets.py
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-

 import json
 import re
 import string
 from collections import Counter
 from nltk.tokenize import TweetTokenizer
 from nltk.corpus import stopwords

 def normalize_contractions(tokens):
    """Expand known contractions and abbreviations in a token stream.

    A token that is a key of the lookup table below (a few English
    contractions plus the Portuguese abbreviations "vc" and "pq") is
    replaced by the whitespace-separated words of its expansion.  Any
    other token is yielded unchanged.  The keys are lowercase and the
    match is exact, so only lowercase tokens are expanded.

    tokens: iterable of token strings.

    Return: generator
    """
    token_map = {
        "i'm": "i am",
        "you're": "you are",
        "it's": "it is",
        "we're": "we are",
        "we'll": "we will",
        "vc" : "voce",
        "pq" : "porque"
    }
    for tok in tokens:
        # One hash lookup on the dict itself; scanning token_map.keys()
        # is a list search on Python 2 and needs a second lookup after.
        expansion = token_map.get(tok)
        if expansion is None:
            yield tok
        else:
            for item in expansion.split():
                yield item

 def process(text, tokenizer=None, stopwords=None):
    """Process the text of a tweet into a list of filtered tokens.

    Steps, in order:
    - Lowercase
    - Tokenize
    - Contraction expansion (normalize_contractions)
    - Stopword removal
    - Digits removal

    text: the tweet text.
    tokenizer: object exposing ``tokenize(text)``.  When omitted, a
        TweetTokenizer(strip_handles=True, reduce_len=True) is created,
        which is the configuration this function previously forced on
        every call regardless of the argument.
    stopwords: collection of tokens to drop.  When omitted, no token is
        treated as a stopword.

    Return: list of strings
    """
    if tokenizer is None:
        tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
    if stopwords is None:
        stopwords = frozenset()
    elif isinstance(stopwords, (list, tuple)):
        # Hash-based membership instead of a linear scan per token.
        stopwords = frozenset(stopwords)
    tokens = normalize_contractions(tokenizer.tokenize(text.lower()))
    return [tok for tok in tokens if tok not in stopwords and not tok.isdigit()]

 # Script entry: tokenize every line of the pre-processed tweet file and
 # print the resulting token list, one list per input line.
 tknzr = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
 punct = list(string.punctuation)
 # Extra tokens to discard on top of the NLTK stopword lists.
 custom_list = ['belo', 'horizonte', 'mg', 'at_user', 'url']
 stopword_list = stopwords.words('portuguese') + stopwords.words('english') + punct + ['rt', 'via'] + custom_list
 # Built once so each membership test in process() is a hash lookup.
 stopword_set = frozenset(stopword_list)

 # The with-block closes the file even if processing a line raises.
 with open('tweets_pre_processed.txt', 'r') as tweets_file:
    for line in tweets_file:
        print(process(text=line, tokenizer=tknzr, stopwords=stopword_set))
	[u'psic\xf3loga', u'hoje', u'deu', u'vontade', u'ler', u'romancezinho', u'bem', u'self', u'insert', u'esquecer', u'existo']
	[]
	[u'whoa', u'must', u'biggest', u'windows', u'blue', u'screen', u'death', u'ever', u'seen']
	[u'whenever', u'see', u'someone', u'write', u'would']
	[u'equipe', u'rocket', u'destruindo', u'padr\xf5es', u'g\xeanero', u'desde']
	[u'want', u'relationship', u'even', u'though', u'women', u'shepard', u'miga', u'mina', u'\xe9', u'_azul_', u'talvez', u'atente', u'pra', u'antes', u'g\xeanero']
	[u'adoro', u'\xe9', u'fucking', u'wrpg', u'ainda', u'assim', u'momento', u'b-but', u'girls', u'liara']
	[]
	[u'mass', u'effect']
	[u'cena', u'ser', u'maravilhosa', u'nunca', u'mds', u'<3']


	PS: Run filter_tweets.py with "python filter_tweets.py >> after.txt"
	psicóloga hoje me deu vontade de ler um romancezinho bem self insert e esquecer que eu existo.
	rt AT_USER URL
	rt AT_USER whoa! "this must be the biggest windows blue screen of death ever seen": URL
	rt AT_USER me whenever i see someone write "should of" or "would of" URL
	rt AT_USER equipe rocket destruindo padrões de gênero desde 1997 URL
	you want a relationship with me? even though we're both women?" shepard, miga, a mina é _azul_. talvez atente pra isso antes do gênero
	adoro que é um fucking wrpg e ainda assim tem o momento "b-but we're both girls!" com a liara
	AT_USER s
	AT_USER mass effect
	essa cena não para de ser maravilhosa nunca mds <3
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-

	import json
	import re
	import string
	from collections import Counter
	from nltk.tokenize import TweetTokenizer
	from nltk.corpus import stopwords

	def normalize_contractions(tokens):
	    """Expand known contractions and abbreviations in a token stream.

	    A token that is a key of the lookup table below (a few English
	    contractions plus the Portuguese abbreviations "vc" and "pq") is
	    replaced by the whitespace-separated words of its expansion.  Any
	    other token is yielded unchanged.  The keys are lowercase and the
	    match is exact, so only lowercase tokens are expanded.

	    tokens: iterable of token strings.

	    Return: generator
	    """
	    token_map = {
	        "i'm": "i am",
	        "you're": "you are",
	        "it's": "it is",
	        "we're": "we are",
	        "we'll": "we will",
	        "vc" : "voce",
	        "pq" : "porque"
	    }
	    for tok in tokens:
	        # One hash lookup on the dict itself; scanning token_map.keys()
	        # is a list search on Python 2 and needs a second lookup after.
	        expansion = token_map.get(tok)
	        if expansion is None:
	            yield tok
	        else:
	            for item in expansion.split():
	                yield item

	def process(text, tokenizer=None, stopwords=None):
	    """Process the text of a tweet into a list of filtered tokens.

	    Steps, in order:
	    - Lowercase
	    - Tokenize
	    - Contraction expansion (normalize_contractions)
	    - Stopword removal
	    - Digits removal

	    text: the tweet text.
	    tokenizer: object exposing ``tokenize(text)``.  When omitted, a
	        TweetTokenizer(strip_handles=True, reduce_len=True) is created,
	        which is the configuration this function previously forced on
	        every call regardless of the argument.
	    stopwords: collection of tokens to drop.  When omitted, no token is
	        treated as a stopword.

	    Return: list of strings
	    """
	    if tokenizer is None:
	        tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)
	    if stopwords is None:
	        stopwords = frozenset()
	    elif isinstance(stopwords, (list, tuple)):
	        # Hash-based membership instead of a linear scan per token.
	        stopwords = frozenset(stopwords)
	    tokens = normalize_contractions(tokenizer.tokenize(text.lower()))
	    return [tok for tok in tokens if tok not in stopwords and not tok.isdigit()]

	# Script entry: tokenize every line of the pre-processed tweet file and
	# print the resulting token list, one list per input line.
	tknzr = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
	punct = list(string.punctuation)
	# Extra tokens to discard on top of the NLTK stopword lists.
	custom_list = ['belo', 'horizonte', 'mg', 'at_user', 'url']
	stopword_list = stopwords.words('portuguese') + stopwords.words('english') + punct + ['rt', 'via'] + custom_list
	# Built once so each membership test in process() is a hash lookup.
	stopword_set = frozenset(stopword_list)

	# The with-block closes the file even if processing a line raises.
	with open('tweets_pre_processed.txt', 'r') as tweets_file:
	    for line in tweets_file:
	        print(process(text=line, tokenizer=tknzr, stopwords=stopword_set))