Created
May 23, 2020 15:24
-
-
Save roshan-adusumilli/5eef0021d9843a979f295e0b376c506a to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Count how often each lower-cased token occurs across the three frames
# (presumably per-state tweet frames whose column at position 1 holds an
# iterable of string tokens per row -- confirm against the loading code).
# Frames and rows are visited in a fixed order so that ties in
# ``most_common`` resolve the same way on every run.
all_words = nltk.FreqDist()
for frame in (ca_df, ny_df, tx_df):
    for tokens in frame.iloc[:, 1]:
        all_words.update(token.lower() for token in tokens)

# Vocabulary used as feature keys: the (up to) 10000 most frequent tokens.
word_features = [word for word, _count in all_words.most_common(10000)]
def find_features(tweet, vocabulary=None):
    """Return a bag-of-words presence map for one tweet.

    Args:
        tweet: Iterable of string tokens.
        vocabulary: Iterable of lower-cased words to use as feature keys.
            Defaults to the module-level ``word_features``.

    Returns:
        A dict mapping every vocabulary word to ``True`` if it occurs in
        ``tweet`` (compared case-insensitively) and ``False`` otherwise.
    """
    if vocabulary is None:
        vocabulary = word_features
    # The vocabulary is built from lower-cased tokens, so fold the tweet's
    # tokens the same way; otherwise capitalised words would never match.
    words = {token.lower() for token in tweet}
    return {word: word in words for word in vocabulary}
# Pair each item's feature dict with its accompanying value from ``dataset``
# (an iterable of ``(tweet, value)`` pairs).
feature_sets = [
    (find_features(tokens), label)
    for tokens, label in dataset
]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment