"""Demo: three ways to turn a small text corpus into vectors.

1. Bag of Words (raw token counts) via CountVectorizer.
2. TF-IDF weighting via TfidfVectorizer.
3. Dense word embeddings via a Word2Vec skip-gram model.

Requires: scikit-learn, gensim, nltk (downloads the 'punkt_tab'
tokenizer data on first run — network access needed).
"""

# 1. Bag of Words
from sklearn.feature_extraction.text import CountVectorizer

corpus = ["gato dormindo", "cachorro latindo", "gato e cachorro brincando"]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)

print("Vocabulário:", vectorizer.get_feature_names_out())
print("Matriz de contagem (Bag of Words):")
print(X.toarray())

# 2. TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

print("Vocabulário:", vectorizer.get_feature_names_out())
print("Matriz TF-IDF:")
print(X.toarray())

# 3. Word2Vec (Skip-gram)
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

# Tokenizer data needed by word_tokenize; fetched once, cached locally.
nltk.download('punkt_tab')

# Simple preprocessing: lowercase and tokenize each sentence.
corpus = ["gato dormindo", "cachorro latindo", "gato e cachorro brincando"]
tokenized = [word_tokenize(frase.lower()) for frase in corpus]

# Train a skip-gram model (sg=1); min_count=1 keeps every word in this
# tiny corpus, vector_size=10 keeps the embeddings small for display.
model = Word2Vec(sentences=tokenized, vector_size=10, window=2, sg=1, min_count=1)

# Print the learned vector for each vocabulary word.
for word in model.wv.index_to_key:
    print(f"Vetor para '{word}': {model.wv[word]}")