Skip to content

Instantly share code, notes, and snippets.

@simonhearne
Created May 5, 2026 16:32
Show Gist options
  • Select an option

  • Save simonhearne/665dfdfc3cb6eca12353c08008a9ff41 to your computer and use it in GitHub Desktop.

Select an option

Save simonhearne/665dfdfc3cb6eca12353c08008a9ff41 to your computer and use it in GitHub Desktop.
### milvus tally operator
### finds the unique values of a field and returns the count of entities for each
from collections import Counter
from pymilvus import MilvusClient
### variables to set
collection_name = "embeddings"  # match the collection name
field = "field_to_tally"  # match the field to count

client = MilvusClient(uri="cluster-url", token="read-only-token")
client.load_collection(collection_name)

# Maps each distinct value of `field` to the number of entities carrying it.
counts = Counter()

# Page through the collection with a scalar-only query, fetching just the
# tallied field. The filter excludes entities whose value is the empty string.
# NOTE(review): comparing against '' presumes `field` is a string-typed
# field — confirm against the collection schema.
iterator = client.query_iterator(
    collection_name=collection_name,
    filter=f"{field} != ''",  # scalar filter — no vector needed
    output_fields=[field],  # only pull the field we care about
    batch_size=1000,
)
try:
    while True:
        batch = iterator.next()
        if not batch:  # stop at the first empty batch
            break
        counts.update(row[field] for row in batch)
finally:
    # Close the iterator on every exit path, including when fetching a
    # batch or updating the tally raises.
    iterator.close()

# "Total entities" is the number of rows returned by the filtered query.
print(f"\nDistinct keys: {len(counts)}")
print(f"Total entities: {sum(counts.values())}")

# Top 10 distinct keys by count
print("\nTop ten distinct keys:")
for key, n in counts.most_common(10):
    print(f" - {key}: {n}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment