# Source: GitHub Gist snippet collection (page chrome removed).
# Flatten every tokenized tweet from the three state DataFrames into one
# lowercase word list (later used to build classifier features).
# NOTE(review): assumes column 1 of each DataFrame holds a list of tokens —
# confirm against the tokenization step.
all_words = []
for i in range(len(ca_df.index)):
    for w in ca_df.iat[i, 1]:
        all_words.append(w.lower())
for i in range(len(ny_df.index)):
    for w in ny_df.iat[i, 1]:
        all_words.append(w.lower())
for i in range(len(tx_df.index)):
    # Reconstructed: the original tx loop body was lost in the paste;
    # restored by analogy with the ca/ny loops above.
    for w in tx_df.iat[i, 1]:
        all_words.append(w.lower())
# Label the unlabeled CA and NY tweets with the trained classifier.
results_new_ca = []
for i in range(len(ca_new_df['tweet_text'])):
    feature = find_features(ca_new_df['tweet_text'][i])
    results_new_ca.append(classifier.classify(feature))
results_new_ny = []
for i in range(len(ny_new_df['tweet_text'])):
    feature = find_features(ny_new_df['tweet_text'][i])
    # Fix: the NY loop computed the feature dict but never classified or
    # stored it (the append line was dropped); mirror the CA loop.
    results_new_ny.append(classifier.classify(feature))
# Train/test split and evaluation setup.
# NOTE(review): 1200 is a hard-coded split point — consider deriving it from
# len(feature_sets) (e.g. an 80/20 split) so it adapts to the dataset size.
training_set = feature_sets[:1200]
testing_set = feature_sets[1200:]
classifier = nltk.NaiveBayesClassifier.train(training_set)
# Predict a label for every held-out example; each element of testing_set is
# a (feature_dict, label) pair, so classify on the first item.
results = [classifier.classify(feats) for feats, _label in testing_set]
error = 0
import random

# Materialize (tweet_text, label) pairs from each labeled DataFrame.
# getattr is used for the label because the column is named 'values',
# which shadows the namedtuple attribute of the same name.
dataset = []
ca_dataset = [
    (getattr(row, 'tweet_text'), getattr(row, 'values'))
    for row in ca_df.itertuples(index=False)
]
ny_dataset = [
    (getattr(row, 'tweet_text'), getattr(row, 'values'))
    for row in ny_df.itertuples(index=False)
]
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def lemmatize_tweets(tweet):
    """Return the WordNet-lemmatized form of *tweet*.

    NOTE(review): WordNetLemmatizer.lemmatize expects a single word; passing
    a whole tweet string leaves multi-word text essentially unchanged, and
    passing a token list would raise. Confirm where in the pipeline this is
    applied — it may need to map over tokens instead.
    """
    tweet = lemmatizer.lemmatize(tweet)
    return tweet

for i in range(len(ca_df['tweet_text'])):
    # Reconstructed: the original loop body was lost in the paste; apply the
    # lemmatizer in place, matching the per-row update pattern used elsewhere.
    ca_df['tweet_text'][i] = lemmatize_tweets(ca_df['tweet_text'][i])
import re

def cleanTxt(text):
    """Strip Twitter noise from *text*: @mentions, '#' marks, leading 'RT'
    retweet tags, URLs, slashes/backslashes, and stray 'x97' encoding
    artifacts. Returns the cleaned string."""
    # Bug fix: the original class was '[A-Za-z0–9]' with an en-dash, so the
    # digits 1-8 were never part of the class and handles like '@user123'
    # left '123' behind. Use an ASCII hyphen for a real 0-9 range.
    text = re.sub(r'@[A-Za-z0-9]+', '', text)   # remove @mentions
    text = re.sub(r'#', '', text)               # drop the '#' symbol only
    text = re.sub(r'RT[\s]+', '', text)         # remove retweet marker
    text = re.sub(r'https?://\S+', '', text)    # remove URLs
    text = re.sub(r'/', '', text)               # remove forward slashes
    text = text.replace('\\', '')               # remove backslashes
    text = re.sub(r'x97', '', text)             # mis-decoded em-dash residue
    # Bug fix: the original fell off the end and returned None.
    return text
nltk.download('stopwords')
from nltk.corpus import stopwords

# Set membership makes the per-token stop-word test O(1).
stop_words = set(stopwords.words('english'))

def remove_stopwords(tweet):
    """Return *tweet* (a list of tokens) with English stop words removed."""
    return [token for token in tweet if token not in stop_words]

ca_df['tweet_text'] = ca_df['tweet_text'].apply(remove_stopwords)
import nltk
from nltk import TweetTokenizer

# One shared tokenizer instance; TweetTokenizer keeps hashtags, mentions and
# emoticons intact as single tokens.
tweet_tokenizer = TweetTokenizer()

def tokenize_tweets(tweet):
    """Split a raw tweet string into a list of tokens."""
    return tweet_tokenizer.tokenize(tweet)

ca_df['tweet_text'] = ca_df['tweet_text'].apply(tokenize_tweets)
ny_df['tweet_text'] = ny_df['tweet_text'].apply(tokenize_tweets)
import pandas as pd
# Load the labeled tweet datasets, one CSV per state (CA, NY, TX).
# index_col=[0] restores the saved row index from the first CSV column.
ca_df = pd.read_csv('ca_labeled.csv', index_col=[0])
ny_df = pd.read_csv('ny_labeled.csv', index_col=[0])
tx_df = pd.read_csv('tx_labeled.csv', index_col=[0])
def features(value):
    """Map the numeric label -1 to the string 'neg'; any other value is
    returned unchanged."""
    return 'neg' if value == -1 else value
import json
import csv
import tweepy
import re
def search_for_hashtags(consumer_key, consumer_secret, access_token, access_token_secret, hashtag_phrase):
#create authentication for accessing Twitter
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)