Skip to content

Instantly share code, notes, and snippets.

@tspannhw
Created September 13, 2024 22:51
Show Gist options
  • Select an option

  • Save tspannhw/c33e302f68d790c870b9419bba2e506f to your computer and use it in GitHub Desktop.

Select an option

Save tspannhw/c33e302f68d790c870b9419bba2e506f to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"id": "ed1f93f0-fe8f-4f5d-998d-5383bd2db6ab",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from pymilvus import MilvusClient\n",
"from langchain_community.document_loaders import PyPDFLoader\n",
"from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
"from langchain.chains import RetrievalQA\n",
"from langchain_community.embeddings import HuggingFaceEmbeddings\n",
"from langchain.callbacks.manager import CallbackManager\n",
"from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler\n",
"from langchain_community.llms import Ollama\n",
"from langchain import hub\n",
"from langchain_huggingface import HuggingFaceEmbeddings\n",
"from langchain_community.vectorstores import Milvus\n",
"from langchain import hub\n",
"from langchain.chains import RetrievalQA\n",
"from langchain_community.llms import Ollama\n",
"from langchain.callbacks.manager import CallbackManager\n",
"from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler\n",
"import requests\n",
"import sys\n",
"import io\n",
"import json\n",
"import shutil\n",
"import sys\n",
"import datetime\n",
"import subprocess\n",
"import math\n",
"import base64\n",
"from time import gmtime, strftime\n",
"import random, string\n",
"import time\n",
"import base64\n",
"import uuid\n",
"import socket\n",
"from pymilvus import connections\n",
"from pymilvus import utility\n",
"from pymilvus import FieldSchema, CollectionSchema, DataType, Collection\n",
"from pymilvus import MilvusClient\n",
"from slack_sdk import WebClient\n",
"from slack_sdk.errors import SlackApiError\n",
"from dotenv import load_dotenv\n",
"load_dotenv(verbose=True)\n",
"from datetime import datetime, timedelta\n",
"from langchain_milvus import Milvus\n",
"\n",
"DIMENSION = 384 \n",
"MILVUS_URL = os.environ.get(\"SERVERLESS_VECTORDB_URI\")\n",
"TOKEN = os.environ.get(\"ZILLIZ_TOKEN\")\n",
"\n",
"slack_token = os.environ.get(\"SLACK_BOT_TOKEN\")\n",
"client = WebClient(token=slack_token)\n",
"embeddings = HuggingFaceEmbeddings(model_name=\"all-MiniLM-L6-v2\")\n",
"\n",
"COLLECTION_NAME = \"mediumarticleskey\"\n",
"TODAYS_DATE = str( datetime.today().strftime('%Y-%m-%d') ) \n",
"YESTERDAYS_DATE = (datetime.now() - timedelta(1)).strftime('%Y-%m-%d')\n",
"\n",
"os.environ[\"LANGCHAIN_HUB_API_URL\"] = \"https://api.hub.langchain.com\"\n",
"os.environ[\"LANGCHAIN_HUB_API_KEY\"] = os.environ.get(\"LANGCHAIN_HUB_API_KEY\")\n",
"os.environ[\"LANGCHAIN_API_KEY\"] = os.environ.get(\"LANGCHAIN_HUB_API_KEY\")\n",
"os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n",
"os.environ[\"LANGCHAIN_ENDPOINT\"] = \"https://api.hub.langchain.com\"\n",
"os.environ[\"TOKENIZERS_PARALLELISM\"] = \"true\"\n",
"\n",
"### Turn off slack warnings\n",
"os.environ[\"SKIP_SLACK_SDK_WARNING\"] = \"false\"\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "3a797d41-4126-4ee9-9c8c-40c535feabe8",
"metadata": {},
"outputs": [],
"source": [
"# -----------------------------------------------------------------------------\n",
"# Connect to Milvus\n",
"# https://zilliz.com/blog/a-beginners-guide-to-using-llama-3-with-ollama-milvus-langchain\n",
"# https://github.com/stephen37/ollama_local_rag/blob/main/rag_berlin_parliament.py\n",
"# https://python.langchain.com/v0.2/docs/integrations/vectorstores/milvus/\n",
"# https://api.python.langchain.com/en/latest/vectorstores/langchain_milvus.vectorstores.milvus.Milvus.html\n",
"\n",
"vector_store = Milvus(\n",
" embedding_function=embeddings,\n",
" collection_name=COLLECTION_NAME,\n",
" primary_field = \"doc_id\",\n",
" text_field=\"doc_text\",\n",
" vector_field=\"doc_embedding\",\n",
" connection_args={\"uri\": MILVUS_URL, \"token\": TOKEN},\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "cc2e7d5b-5830-4b92-a6e3-9880a3eb8225",
"metadata": {},
"outputs": [],
"source": [
"def run_query() -> None:\n",
" llm = Ollama(\n",
" model=\"llama3.1\",\n",
" callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]),\n",
" stop=[\"<|eot_id|>\"],\n",
" )\n",
"\n",
" query = input(\"\\nAsk the medium articles: \")\n",
" qa_chain = RetrievalQA.from_chain_type(\n",
" llm, retriever=vector_store.as_retriever(collection = COLLECTION_NAME))\n",
"\n",
" result = qa_chain.invoke({\"query\": query})\n",
" \n",
" resultforslack = str(result[\"result\"])\n",
" print(resultforslack)\n",
"\n",
" try:\n",
" response = client.chat_postMessage(mrkdwn=True, channel=\"C06NE1FU6SE\", text=\"\", \n",
" blocks=[{\"type\": \"section\",\"text\": {\"type\": \"mrkdwn\",\"text\": \"*\" + str(query) + \"* \\n\\n\" + str(resultforslack) +\"\\n\" }}])\n",
"\n",
" except SlackApiError as e:\n",
" # You will get a SlackApiError if \"ok\" is False\n",
" print(\"Slack failed\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "83cc9bd7-f32e-4b67-bc2d-5441e939782d",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/c2/6wq9klzn19d97gvw8f8j1jdr0000gn/T/ipykernel_4256/3227108844.py:2: DeprecationWarning: callback_manager is deprecated. Please use callbacks instead.\n",
" llm = Ollama(\n"
]
},
{
"name": "stdin",
"output_type": "stream",
"text": [
"\n",
"Search our articles: How can I store Air Quality in Milvus?\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"To store air quality data in Milvus, you would need to create a collection with fields that represent the different parameters of air quality, such as pollutant concentrations (e.g., PM2.5, PM10, NO2, O3, etc.) and other relevant measurements.\n",
"\n",
"Here's a general outline of how you can achieve this:\n",
"\n",
"1. **Create a schema**: Define the structure of your data by specifying the fields you want to store in Milvus. For air quality data, this might include:\n",
"\t* pollutant names (e.g., PM2.5, PM10, NO2, O3)\n",
"\t* measurement values\n",
"\t* timestamp or date information\n",
"\t* location or geographical coordinates\n",
"2. **Insert data into the collection**: Use Milvus's insert functionality to add air quality data to your collection. You can do this by passing a list of dictionaries, where each dictionary represents an air quality reading.\n",
"3. **Configure vector dimensions and distance calculation**: Since air quality data is typically not numerical vectors, you would need to transform your data into a suitable format for Milvus's similarity search capabilities. This might involve:\n",
"\t* Converting pollutant concentrations into numerical values (e.g., using a lookup table or mathematical functions)\n",
"\t* Creating a vector representation of each air quality reading\n",
"4. **Perform similarity searches**: Once your collection is populated, you can perform similarity searches to find air quality readings that are similar to a given query.\n",
"\n",
"Here's an example of how you could create a collection and insert data:\n",
"```python\n",
"import milvus\n",
"\n",
"# Create a collection with fields for pollutant names, measurement values, timestamp, and location\n",
"collection = milvus.create_collection(\n",
" name='air_quality',\n",
" dimension=10,\n",
" description='Air quality data'\n",
")\n",
"\n",
"# Insert air quality data into the collection\n",
"data = [\n",
" {'pollutant': 'PM2.5', 'value': 12, 'timestamp': '2023-03-01 00:00:00', 'location': [48.8566, 2.3522]},\n",
" {'pollutant': 'NO2', 'value': 25, 'timestamp': '2023-03-02 00:00:00', 'location': [48.8567, 2.3532]}\n",
"]\n",
"\n",
"collection.insert(data)\n",
"```\n",
"Please note that this is a simplified example and might need to be adjusted based on your specific use case.\n",
"\n",
"If you want to perform similarity searches on the inserted data, you would also need to configure Milvus to calculate distances using an appropriate distance metric (e.g., Euclidean, cosine similarity).To store air quality data in Milvus, you would need to create a collection with fields that represent the different parameters of air quality, such as pollutant concentrations (e.g., PM2.5, PM10, NO2, O3, etc.) and other relevant measurements.\n",
"\n",
"Here's a general outline of how you can achieve this:\n",
"\n",
"1. **Create a schema**: Define the structure of your data by specifying the fields you want to store in Milvus. For air quality data, this might include:\n",
"\t* pollutant names (e.g., PM2.5, PM10, NO2, O3)\n",
"\t* measurement values\n",
"\t* timestamp or date information\n",
"\t* location or geographical coordinates\n",
"2. **Insert data into the collection**: Use Milvus's insert functionality to add air quality data to your collection. You can do this by passing a list of dictionaries, where each dictionary represents an air quality reading.\n",
"3. **Configure vector dimensions and distance calculation**: Since air quality data is typically not numerical vectors, you would need to transform your data into a suitable format for Milvus's similarity search capabilities. This might involve:\n",
"\t* Converting pollutant concentrations into numerical values (e.g., using a lookup table or mathematical functions)\n",
"\t* Creating a vector representation of each air quality reading\n",
"4. **Perform similarity searches**: Once your collection is populated, you can perform similarity searches to find air quality readings that are similar to a given query.\n",
"\n",
"Here's an example of how you could create a collection and insert data:\n",
"```python\n",
"import milvus\n",
"\n",
"# Create a collection with fields for pollutant names, measurement values, timestamp, and location\n",
"collection = milvus.create_collection(\n",
" name='air_quality',\n",
" dimension=10,\n",
" description='Air quality data'\n",
")\n",
"\n",
"# Insert air quality data into the collection\n",
"data = [\n",
" {'pollutant': 'PM2.5', 'value': 12, 'timestamp': '2023-03-01 00:00:00', 'location': [48.8566, 2.3522]},\n",
" {'pollutant': 'NO2', 'value': 25, 'timestamp': '2023-03-02 00:00:00', 'location': [48.8567, 2.3532]}\n",
"]\n",
"\n",
"collection.insert(data)\n",
"```\n",
"Please note that this is a simplified example and might need to be adjusted based on your specific use case.\n",
"\n",
"If you want to perform similarity searches on the inserted data, you would also need to configure Milvus to calculate distances using an appropriate distance metric (e.g., Euclidean, cosine similarity).\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/c2/6wq9klzn19d97gvw8f8j1jdr0000gn/T/ipykernel_4256/3227108844.py:2: DeprecationWarning: callback_manager is deprecated. Please use callbacks instead.\n",
" llm = Ollama(\n"
]
},
{
"name": "stdin",
"output_type": "stream",
"text": [
"\n",
"Search our articles: What Vector Embeddings should I use for images in Milvus?\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"For images, you can use various vector embeddings such as:\n",
"\n",
"1. **VGG-Face**: A face recognition model that can be used to extract features from images.\n",
"2. **ImageNet**: A pre-trained CNN that can be fine-tuned for specific image classification tasks.\n",
"3. **ResNet-50**: A powerful CNN that can be used to extract features from images.\n",
"4. **MobileNetV2**: A lightweight CNN that can be used for efficient feature extraction.\n",
"\n",
"You can also use pre-trained models like:\n",
"\n",
"1. **EfficientNet-B0**: A family of CNNs that are optimized for image classification tasks.\n",
"2. **DenseNet-121**: A CNN that uses dense connections to improve feature extraction.\n",
"\n",
"When choosing a vector embedding, consider the following factors:\n",
"\n",
"* **Computational resources**: Choose an embedding that is computationally efficient and can be run on your available hardware.\n",
"* **Image size**: Select an embedding that can handle images of various sizes or resolutions.\n",
"* **Classification task**: Opt for an embedding that is specifically designed for your image classification task (e.g., object detection, facial recognition).\n",
"* **Pre-trained weights availability**: Use pre-trained models to save time and effort in training the model from scratch.\n",
"\n",
"Remember to fine-tune the pre-trained model on your specific dataset to improve performance and adapt the embeddings to your use case.For images, you can use various vector embeddings such as:\n",
"\n",
"1. **VGG-Face**: A face recognition model that can be used to extract features from images.\n",
"2. **ImageNet**: A pre-trained CNN that can be fine-tuned for specific image classification tasks.\n",
"3. **ResNet-50**: A powerful CNN that can be used to extract features from images.\n",
"4. **MobileNetV2**: A lightweight CNN that can be used for efficient feature extraction.\n",
"\n",
"You can also use pre-trained models like:\n",
"\n",
"1. **EfficientNet-B0**: A family of CNNs that are optimized for image classification tasks.\n",
"2. **DenseNet-121**: A CNN that uses dense connections to improve feature extraction.\n",
"\n",
"When choosing a vector embedding, consider the following factors:\n",
"\n",
"* **Computational resources**: Choose an embedding that is computationally efficient and can be run on your available hardware.\n",
"* **Image size**: Select an embedding that can handle images of various sizes or resolutions.\n",
"* **Classification task**: Opt for an embedding that is specifically designed for your image classification task (e.g., object detection, facial recognition).\n",
"* **Pre-trained weights availability**: Use pre-trained models to save time and effort in training the model from scratch.\n",
"\n",
"Remember to fine-tune the pre-trained model on your specific dataset to improve performance and adapt the embeddings to your use case.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/c2/6wq9klzn19d97gvw8f8j1jdr0000gn/T/ipykernel_4256/3227108844.py:2: DeprecationWarning: callback_manager is deprecated. Please use callbacks instead.\n",
" llm = Ollama(\n"
]
},
{
"name": "stdin",
"output_type": "stream",
"text": [
"\n",
"Search our articles: What do I need to build Intelligent AI Agents?\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"To build Intelligent AI Agents, you'll likely need several components:\n",
"\n",
"1. **Vector Database**: A database that can efficiently store and search vectors (numeric representations of data). Milvus Lite, a compact and efficient vector database, is specifically designed for local deployment in Python applications.\n",
"2. **Python Library**: The pymilvus library (version 2.4.3 or above) needs to be installed to incorporate Milvus Lite into your AI application.\n",
"3. **AI Development Stack Integration**: Consider integrating with popular frameworks and providers like LangChain, LlamaIndex, Haystack, Voyage AI, Ragas, Jina AI, DSPy, BentoML, WhyHow, Relari AI, Airbyte, HuggingFace, and MemGPT to simplify the development of AI applications.\n",
"4. **Prototyping to Production Environment**: Milvus Lite supports every stage of your development journey, from prototyping to production environments.\n",
"\n",
"By having these components in place, you can start building Intelligent AI Agents that leverage vector search capabilities.To build Intelligent AI Agents, you'll likely need several components:\n",
"\n",
"1. **Vector Database**: A database that can efficiently store and search vectors (numeric representations of data). Milvus Lite, a compact and efficient vector database, is specifically designed for local deployment in Python applications.\n",
"2. **Python Library**: The pymilvus library (version 2.4.3 or above) needs to be installed to incorporate Milvus Lite into your AI application.\n",
"3. **AI Development Stack Integration**: Consider integrating with popular frameworks and providers like LangChain, LlamaIndex, Haystack, Voyage AI, Ragas, Jina AI, DSPy, BentoML, WhyHow, Relari AI, Airbyte, HuggingFace, and MemGPT to simplify the development of AI applications.\n",
"4. **Prototyping to Production Environment**: Milvus Lite supports every stage of your development journey, from prototyping to production environments.\n",
"\n",
"By having these components in place, you can start building Intelligent AI Agents that leverage vector search capabilities.\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/c2/6wq9klzn19d97gvw8f8j1jdr0000gn/T/ipykernel_4256/3227108844.py:2: DeprecationWarning: callback_manager is deprecated. Please use callbacks instead.\n",
" llm = Ollama(\n"
]
}
],
"source": [
"while True:\n",
" run_query()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fb52c44e-5555-4048-a5bd-c31ba512f166",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment