Skip to content

Instantly share code, notes, and snippets.

@talolard
Created March 27, 2026 10:14
Show Gist options
  • Select an option

  • Save talolard/6d3aa0d7b41c8030c961f898d8edab96 to your computer and use it in GitHub Desktop.

Select an option

Save talolard/6d3aa0d7b41c8030c961f898d8edab96 to your computer and use it in GitHub Desktop.
Agent-based lead generation example: report and Playwright extractor
from __future__ import annotations
import argparse
import asyncio
import json
import logging
import re
from dataclasses import asdict, dataclass
from typing import Any, Iterable, Mapping, Sequence
from playwright.async_api import async_playwright
# Entry point of the public German tax-advisor registry (German-locale UI).
SEARCH_URL = "https://steuerberaterverzeichnis.berufs-org.de/?lang=de"
# Result cards are anchors whose href is a relative "details/<hex-id>/..." path.
DETAILS_LINK_SELECTOR = "a[href^='details/']"
# Module-level logger; configured in main() via logging.basicConfig.
LOGGER = logging.getLogger(__name__)
@dataclass(frozen=True)
class Lead:
    """One search-result card: name lines, address lines, and detail URL."""

    name_lines: tuple[str, ...]  # person/firm name, in page order
    address_lines: tuple[str, ...]  # street / postcode+city lines
    details_url: str  # absolute URL of the detail page

    def format_line(self) -> str:
        """Render the lead as a single pipe-delimited text line."""
        joined_name = " ".join(self.name_lines).strip()
        joined_address = ", ".join(self.address_lines).strip()
        return " | ".join((joined_name, joined_address, self.details_url))
@dataclass(frozen=True)
class SummarySection:
    """Top-of-page profile data: identity, legal form, address, contacts."""

    identity_lines: tuple[str, ...]  # raw name lines collected before any labeled field
    legal_form: str | None  # value of the "Rechtsform" label, when present
    address_lines: tuple[str, ...]  # split at the first 5-digit postal code found
    contacts: "ContactsSection"  # phone/mobile/fax/email/website block
    safe_id: str | None  # value of the "Safe ID" label, when present
@dataclass(frozen=True)
class ContactsSection:
    """Contact channels parsed from labeled summary paragraphs; None = absent."""

    phone: str | None  # "Telefon" label
    mobile: str | None  # "Mobil" label
    fax: str | None  # "Telefax" label
    email: str | None  # "E-Mail" label
    website: str | None  # "Internet" label
@dataclass(frozen=True)
class NoteEntry:
    """A single 'label: value' line from the 'Hinweise' section."""

    label: str  # text before the first colon, stripped
    value: str  # text after the first colon, stripped
@dataclass(frozen=True)
class NotesSection:
    """All parsed 'Hinweise' entries, in page order."""

    entries: tuple[NoteEntry, ...]
@dataclass(frozen=True)
class ChamberSection:
    """The responsible chamber ('Zuständige Steuerberaterkammer')."""

    name: str  # first line of the chamber block
    address_lines: tuple[str, ...]  # remaining lines until a sub-section marker
@dataclass(frozen=True)
class AdditionalOfficesSection:
    """Lines listed under 'Weitere Beratungsstelle(n)' (additional offices)."""

    offices: tuple[str, ...]
@dataclass(frozen=True)
class ShareholdersSection:
    """Lines listed under 'Gesellschafter' (shareholders)."""

    members: tuple[str, ...]
@dataclass(frozen=True)
class RegisterSection:
    """Lines starting at (and following) any 'Register...' line."""

    entries: tuple[str, ...]
@dataclass(frozen=True)
class RepresentativeGroup:
    """One titled group of representatives (a <span> title plus <ul> members)."""

    title: str  # group heading; may be empty when the page omits it
    members: tuple[str, ...]  # normalized, non-empty member names
@dataclass(frozen=True)
class RepresentativesSection:
    """All representative groups from the 'Vertreter' section."""

    groups: tuple[RepresentativeGroup, ...]
@dataclass(frozen=True)
class LeadDetails:
    """Fully parsed detail page for one registry entry; optional sections are None when absent."""

    details_url: str  # absolute URL the payload was fetched from
    summary: SummarySection  # always built, even from an empty payload
    notes: NotesSection | None
    chamber: ChamberSection | None
    additional_offices: AdditionalOfficesSection | None
    shareholders: ShareholdersSection | None
    register: RegisterSection | None
    representatives: RepresentativesSection | None
def _split_card_lines(lines: Iterable[str]) -> tuple[tuple[str, ...], tuple[str, ...]]:
"""Split card text into name and address groups using a postal-code boundary."""
name_lines: list[str] = []
address_lines: list[str] = []
postal_seen = False
postal_re = re.compile(r"\b\d{5}\b")
for line in lines:
cleaned = line.strip()
if not cleaned:
continue
if postal_re.search(cleaned):
postal_seen = True
if postal_seen:
address_lines.append(cleaned)
else:
name_lines.append(cleaned)
return tuple(name_lines), tuple(address_lines)
def _normalize_whitespace(value: str) -> str:
return " ".join(value.split()).strip()
def _split_address_line(value: str) -> tuple[str, ...]:
match = re.search(r"\b\d{5}\s+\S.*", value)
if not match:
return (value.strip(),)
return (value[: match.start()].strip(), value[match.start() :].strip())
def _extract_labeled_value(
text: str,
label: str,
) -> str | None:
pattern = re.compile(rf"^{re.escape(label)}\s*:\s*(.+)$", re.IGNORECASE)
match = pattern.match(_normalize_whitespace(text))
if not match:
return None
return match.group(1).strip()
# Labeled summary fields, checked in this order; first matching label wins
# for a given paragraph.
_SUMMARY_LABELS: tuple[tuple[str, str], ...] = (
    ("legal_form", "Rechtsform"),
    ("phone", "Telefon"),
    ("mobile", "Mobil"),
    ("fax", "Telefax"),
    ("email", "E-Mail"),
    ("website", "Internet"),
    ("safe_id", "Safe ID"),
)


def _parse_summary(summary_paragraphs: Sequence[str]) -> SummarySection:
    """Map free-form summary paragraphs onto identity/address/contact fields."""
    fields: dict[str, str | None] = {name: None for name, _ in _SUMMARY_LABELS}
    identity_lines: tuple[str, ...] = tuple()
    address_lines: tuple[str, ...] = tuple()
    collecting_names = True
    for paragraph in summary_paragraphs:
        normalized = _normalize_whitespace(paragraph)
        has_postal = bool(re.search(r"\b\d{5}\b", normalized))
        if collecting_names and ":" not in normalized and not has_postal:
            # Keep the raw (unnormalized) lines so multi-line names survive.
            identity_lines += tuple(
                part for part in paragraph.splitlines() if part.strip()
            )
            continue
        labeled_value = None
        for field_name, label in _SUMMARY_LABELS:
            labeled_value = _extract_labeled_value(paragraph, label)
            if labeled_value:
                fields[field_name] = labeled_value
                collecting_names = False
                break
        if labeled_value:
            continue
        # An unlabeled paragraph with a postal code is taken as the address
        # (first occurrence only).
        if has_postal and not address_lines:
            address_lines = _split_address_line(normalized)
            collecting_names = False
    contacts = ContactsSection(
        phone=fields["phone"],
        mobile=fields["mobile"],
        fax=fields["fax"],
        email=fields["email"],
        website=fields["website"],
    )
    return SummarySection(
        identity_lines=identity_lines,
        legal_form=fields["legal_form"],
        address_lines=address_lines,
        contacts=contacts,
        safe_id=fields["safe_id"],
    )


def _parse_notes(notes_lines: Iterable[str]) -> NotesSection | None:
    """Turn 'label: value' lines into NoteEntry records; None when empty."""
    entries: list[NoteEntry] = []
    for line in notes_lines:
        normalized = _normalize_whitespace(line)
        if ":" in normalized:
            label, value = normalized.split(":", 1)
            entries.append(NoteEntry(label=label.strip(), value=value.strip()))
    return NotesSection(tuple(entries)) if entries else None


def _parse_chamber(
    chamber_lines: Sequence[str],
) -> tuple[
    ChamberSection | None,
    AdditionalOfficesSection | None,
    ShareholdersSection | None,
    RegisterSection | None,
]:
    """Split the chamber block into chamber/offices/shareholders/register.

    The block is a flat list of lines; marker lines switch the bucket the
    following lines are collected into.
    """
    if not chamber_lines:
        return None, None, None, None
    buckets: dict[str, list[str]] = {
        "address": [],
        "offices": [],
        "shareholders": [],
        "register": [],
    }
    mode = "address"
    for line in chamber_lines[1:]:
        if line == "Weitere Beratungsstelle(n)":
            mode = "offices"
            continue
        if line == "Gesellschafter":
            mode = "shareholders"
            continue
        if line.startswith("Register"):
            # The "Register..." marker line itself is part of the entries.
            mode = "register"
            buckets["register"].append(line)
            continue
        if line == "Alle ausblenden":
            # UI toggle label, not data.
            continue
        buckets[mode].append(line)
    chamber = ChamberSection(
        name=chamber_lines[0], address_lines=tuple(buckets["address"])
    )
    offices = (
        AdditionalOfficesSection(tuple(buckets["offices"]))
        if buckets["offices"]
        else None
    )
    shareholders = (
        ShareholdersSection(tuple(buckets["shareholders"]))
        if buckets["shareholders"]
        else None
    )
    register = (
        RegisterSection(tuple(buckets["register"])) if buckets["register"] else None
    )
    return chamber, offices, shareholders, register


def _parse_representatives(
    groups_payload: Iterable[Mapping[str, Any]],
) -> RepresentativesSection | None:
    """Normalize the representative groups; None when nothing remains."""
    groups: list[RepresentativeGroup] = []
    for group in groups_payload:
        title = _normalize_whitespace(group.get("title", ""))
        members = tuple(
            _normalize_whitespace(member)
            for member in group.get("members", [])
            if _normalize_whitespace(member)
        )
        if title or members:
            groups.append(RepresentativeGroup(title=title, members=members))
    return RepresentativesSection(tuple(groups)) if groups else None


def _lead_details_from_payload(
    details_url: str, payload: Mapping[str, Any]
) -> LeadDetails:
    """Assemble a LeadDetails record from the raw page payload.

    Args:
        details_url: Absolute URL the payload was extracted from.
        payload: Raw text blocks as produced by ``_extract_detail_payload``
            (keys: ``summary_paragraphs``, ``notes_lines``, ``chamber_lines``,
            ``representatives``); missing keys are treated as empty.

    Returns:
        A fully populated ``LeadDetails``; optional sections are ``None``
        when the corresponding block was absent or empty.
    """
    summary = _parse_summary(list(payload.get("summary_paragraphs", [])))
    notes_section = _parse_notes(payload.get("notes_lines", []))
    chamber_lines = [
        _normalize_whitespace(line) for line in payload.get("chamber_lines", [])
    ]
    (
        chamber_section,
        additional_offices,
        shareholders,
        register_section,
    ) = _parse_chamber(chamber_lines)
    representatives_section = _parse_representatives(
        payload.get("representatives", [])
    )
    return LeadDetails(
        details_url=details_url,
        summary=summary,
        notes=notes_section,
        chamber=chamber_section,
        additional_offices=additional_offices,
        shareholders=shareholders,
        register=register_section,
        representatives=representatives_section,
    )
async def _extract_detail_payload(page: Any) -> dict[str, Any]:
    """Return structured text blocks for the detail page so Python can map them.

    The evaluated JS collects:
      * ``summary_paragraphs`` — text of every <p> before the first <h2>
        (or all <p> when no <h2> exists),
      * ``notes_lines`` / ``chamber_lines`` — text lines between the named
        <h2> heading and the next <h2>,
      * ``representatives`` — {title, members} groups, pairing each <span>
        title with the <li> items of the following <ul>.

    The mapping of these raw blocks onto dataclasses happens in
    ``_lead_details_from_payload``.
    """
    return await page.evaluate(
        """
        () => {
          const heading = (title) =>
            Array.from(document.querySelectorAll('h2')).find(
              (h) => h.textContent && h.textContent.trim() === title
            );
          const collectSectionLines = (title) => {
            const h2 = heading(title);
            if (!h2) return [];
            const lines = [];
            let el = h2.nextElementSibling;
            while (el && el.tagName !== 'H2') {
              if (el.innerText) {
                const split = el.innerText
                  .split('\\n')
                  .map((line) => line.trim())
                  .filter(Boolean);
                lines.push(...split);
              }
              el = el.nextElementSibling;
            }
            return lines;
          };
          const summaryParagraphs = () => {
            const firstH2 = document.querySelector('h2');
            if (!firstH2) {
              return Array.from(document.querySelectorAll('p'))
                .map((p) => p.innerText)
                .filter(Boolean);
            }
            return Array.from(document.querySelectorAll('p'))
              .filter((p) => {
                return (
                  p.compareDocumentPosition(firstH2) &
                  Node.DOCUMENT_POSITION_FOLLOWING
                );
              })
              .map((p) => p.innerText)
              .filter(Boolean);
          };
          const representatives = () => {
            const h2 = heading('Vertreter');
            const container = h2 ? h2.nextElementSibling : null;
            if (!container) return [];
            const groups = [];
            let currentTitle = '';
            for (const child of Array.from(container.children)) {
              if (child.tagName === 'SPAN') {
                currentTitle = child.textContent ? child.textContent.trim() : '';
                continue;
              }
              if (child.tagName === 'UL') {
                const members = Array.from(child.querySelectorAll('li'))
                  .map((li) => (li.textContent || '').trim())
                  .filter(Boolean);
                if (currentTitle || members.length) {
                  groups.push({ title: currentTitle, members });
                }
              }
            }
            return groups;
          };
          return {
            summary_paragraphs: summaryParagraphs(),
            notes_lines: collectSectionLines('Hinweise'),
            chamber_lines: collectSectionLines('Zuständige Steuerberaterkammer'),
            representatives: representatives(),
          };
        }
        """
    )
async def search_zip(zip_code: str) -> list[Lead]:
    """Search the registry by ZIP code and return the visible result cards.

    Launches headless Chromium, submits the postcode search form, and
    scrapes each result card into a :class:`Lead`.

    Args:
        zip_code: German postcode to search for (e.g. ``"13187"``).

    Returns:
        One ``Lead`` per result card, in page order.
    """
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            LOGGER.info("Navigating to search page.")
            await page.goto(SEARCH_URL, wait_until="domcontentloaded")
            LOGGER.info("Filling ZIP code %s.", zip_code)
            await page.fill("#plz-text", zip_code)
            LOGGER.info("Submitting search.")
            await page.click("input.verzeichnis-btn.my-3")
            LOGGER.info("Waiting for results header.")
            await page.wait_for_selector("text=Treffer:")
            LOGGER.info("Waiting for results to settle.")
            await page.wait_for_timeout(500)
            # Result cards are detail links that contain the "next" arrow image.
            cards = page.locator(DETAILS_LINK_SELECTOR).filter(
                has=page.locator("img[alt='next']")
            )
            count = await cards.count()
            LOGGER.info("Found %s result cards.", count)
            leads: list[Lead] = []
            for idx in range(count):
                card = cards.nth(idx)
                details_url = await card.get_attribute("href") or ""
                if details_url.startswith("details/"):
                    # Hrefs are relative; make them absolute for later use.
                    details_url = (
                        "https://steuerberaterverzeichnis.berufs-org.de/"
                        f"{details_url}"
                    )
                text = await card.inner_text()
                name_lines, address_lines = _split_card_lines(text.splitlines())
                leads.append(
                    Lead(
                        name_lines=name_lines,
                        address_lines=address_lines,
                        details_url=details_url,
                    )
                )
            return leads
        finally:
            # Close the browser even when a selector wait times out mid-scrape;
            # the original code leaked it on any exception.
            await browser.close()
# Base URL used to absolutize relative "details/..." hrefs.
_REGISTRY_BASE_URL = "https://steuerberaterverzeichnis.berufs-org.de/"


async def _search_result_urls(page: Any, zip_code: str) -> list[str]:
    """Submit the postcode search on *page* and return absolute detail URLs."""
    LOGGER.info("Navigating to search page.")
    await page.goto(SEARCH_URL, wait_until="domcontentloaded")
    LOGGER.info("Filling ZIP code %s.", zip_code)
    await page.fill("#plz-text", zip_code)
    LOGGER.info("Submitting search.")
    await page.click("input.verzeichnis-btn.my-3")
    LOGGER.info("Waiting for results header.")
    await page.wait_for_selector("text=Treffer:")
    LOGGER.info("Waiting for results to settle.")
    await page.wait_for_timeout(500)
    # Result cards are detail links that contain the "next" arrow image.
    cards = page.locator(DETAILS_LINK_SELECTOR).filter(
        has=page.locator("img[alt='next']")
    )
    count = await cards.count()
    LOGGER.info("Found %s result cards.", count)
    urls: list[str] = []
    for idx in range(count):
        href = await cards.nth(idx).get_attribute("href") or ""
        if href.startswith("details/"):
            href = f"{_REGISTRY_BASE_URL}{href}"
        if href:
            urls.append(href)
    return urls


async def _fetch_detail_payload(browser: Any, details_url: str) -> dict[str, Any]:
    """Open *details_url* in a fresh page and extract its payload.

    Retries once on an HTTP error status or an empty summary; returns the
    last payload extracted (possibly empty) when both attempts come up
    short. The page is always closed, even on exceptions.
    """
    detail_page = await browser.new_page()
    try:
        payload: dict[str, Any] = {}
        for _attempt in range(2):
            LOGGER.info("Navigating to detail page.")
            response = await detail_page.goto(
                details_url, wait_until="domcontentloaded"
            )
            if response is not None and response.status >= 400:
                LOGGER.warning(
                    "Detail page returned %s, retrying.", response.status
                )
                LOGGER.info("Waiting before retry.")
                await detail_page.wait_for_timeout(1000)
                continue
            try:
                LOGGER.info("Waiting for Safe ID marker.")
                await detail_page.wait_for_selector("text=Safe ID", timeout=5000)
            except Exception:
                # Not every profile has a Safe ID; fall back to any section
                # header, and proceed regardless if that also times out.
                LOGGER.info("Safe ID not found, waiting for section header.")
                try:
                    await detail_page.wait_for_selector("h2", timeout=3000)
                except Exception:
                    pass
            LOGGER.info("Waiting for detail content to settle.")
            await detail_page.wait_for_timeout(500)
            LOGGER.info("Extracting detail payload.")
            payload = await _extract_detail_payload(detail_page)
            if payload.get("summary_paragraphs"):
                break
            LOGGER.info("Detail payload empty, retrying.")
            await detail_page.wait_for_timeout(1000)
        return payload
    finally:
        # The original code leaked the page when navigation/extraction raised.
        await detail_page.close()


async def fetch_details(zip_code: str) -> list[LeadDetails]:
    """Search by ZIP code, follow each result detail page, and parse sections.

    Args:
        zip_code: German postcode to search for.

    Returns:
        One ``LeadDetails`` per search result, in page order.
    """
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            detail_urls = await _search_result_urls(page, zip_code)
            results: list[LeadDetails] = []
            for idx, details_url in enumerate(detail_urls, start=1):
                LOGGER.info("Fetching details %s/%s.", idx, len(detail_urls))
                payload = await _fetch_detail_payload(browser, details_url)
                results.append(_lead_details_from_payload(details_url, payload))
                LOGGER.info("Waiting between detail requests.")
                # Simple politeness delay between requests.
                await page.wait_for_timeout(1000)
            return results
        finally:
            # Close the browser even when a step raises; the original code
            # leaked it on any exception.
            await browser.close()
async def _main() -> None:
    """Parse CLI arguments and dispatch to the chosen workflow."""
    description = (
        "Search the Steuerberaterverzeichnis by ZIP code using Playwright "
        "and print the result cards."
    )
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("--zip", dest="zip_code", required=True)
    parser.add_argument(
        "--details",
        action="store_true",
        help="Follow each detail page and print structured section data as JSON.",
    )
    args = parser.parse_args()
    if not args.details:
        # Default mode: one pipe-delimited line per result card.
        for lead in await search_zip(args.zip_code):
            print(lead.format_line())
        return
    # Detail mode: one JSON object per line (ASCII-escaped).
    for item in await fetch_details(args.zip_code):
        print(json.dumps(asdict(item), ensure_ascii=True))
def main() -> None:
    """CLI entry point: configure logging, then run the async workflow."""
    log_format = "%(asctime)s %(levelname)s %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_format)
    asyncio.run(_main())
# Allow the module to be run directly as a script.
if __name__ == "__main__":
    main()

Using Agents for Lead Generation: A Concrete Example

This is a small real example of using an agent to build a lead-generation workflow from a public directory.

The session I traced was:

  • Session id: 019beb75-e471-7501-bff9-75e772c988f7
  • Date: 2026-01-23
  • Repo root at the time: /Users/tal/dev/tax-leads

The Prompt I Gave

Here is the initial prompt exactly as given:

Write a script that uses Playwright to search for everyone in zip code https://steuerberaterverzeichnis.berufs-org.de/?lang=de  on element

<input autocomplete="off" class="form-control" id="plz-text" placeholder="Postcode" aria-label="Postcode" type="text" maxlength="1024" name="plzFilter" value="">

The search button is <input type="submit" value="Search" class="verzeichnis-btn my-3"> .

Use the playwright mcp

Why This Prompt Worked

What makes this a good agent prompt is that it is concrete. It does not just say "get me leads." It gives the agent enough operational detail to act.

1. It names the source

The prompt points at a specific public directory:

  • https://steuerberaterverzeichnis.berufs-org.de/?lang=de

That immediately narrows the task from "find leads on the internet" to "extract leads from this known source."

2. It gives the interaction primitive

It specifies the actual input element:

  • #plz-text

and the actual search button:

  • input.verzeichnis-btn.my-3

This is important because browser agents are much more reliable when you give them page-level anchors instead of vague instructions like "find the postcode field."

3. It defines the search scope

The prompt says:

  • search for everyone in a zip code

That gives the workflow a clear unit of work. Instead of scraping the whole registry, the agent can operate per postcode. That is a practical lead-generation pattern because it is easy to batch, retry, and validate.

4. It chooses the tool

The prompt explicitly says:

  • use the Playwright MCP

That tells the agent not just what to do, but how to do it. In practice this matters a lot. The agent does not have to guess between raw HTTP, BeautifulSoup, Selenium, or browser automation. It can go straight to the right execution model.

What the Agent Actually Built

The session created a single script that evolved over the conversation.

It started as a ZIP-code search script and then got expanded through follow-up prompts into a richer extractor.

By the end, the script was not just collecting a list of names. It was following each result page and mapping the public profile into structured Python dataclasses.

What Kind of Leads This Would Generate

This workflow generates a very specific kind of lead:

  • tax advisors and accounting-related firms listed in the German public tax advisor directory
  • geographically scoped by postcode
  • pulled from a regulated public registry rather than from generic web scraping

That means the leads are not random B2B prospects. They are directory-listed professional contacts in a known category.

In practical terms, the output can include:

  • identity or firm name
  • street and postal address
  • detail page URL
  • legal form
  • phone
  • mobile
  • fax
  • email
  • website
  • Safe ID
  • chamber information
  • additional offices
  • shareholders
  • register entries
  • representatives

So this is useful if someone wants:

  • local professional-services leads by geography
  • a structured contact list for outreach research
  • a way to build a niche directory dataset
  • a repeatable collector for one postcode at a time

It would be especially good for:

  • local sales prospecting
  • market mapping
  • territory research
  • enrichment pipelines where public directory data is one input among several

What This Prompt Does Not Do

It is worth being clear about the boundary.

This prompt does not ask the agent to:

  • decide who is a good lead
  • score or rank the leads
  • deduplicate against a CRM
  • enrich them with company size, revenue, tech stack, or buying intent
  • write outreach copy

It is a lead collection prompt, not a full go-to-market system prompt.

That distinction is useful when talking to someone about agents. A good agent workflow usually starts with a narrow, reliable primitive like "collect structured leads from this source," and only later adds ranking, enrichment, filtering, or messaging.

Was There Back-and-Forth?

Yes. There was real iteration, but it was fairly focused.

After the initial prompt, the conversation added a few concrete follow-ups:

Run it on zipcode 13187, each result url is a hex encoded uuid like https://steuerberaterverzeichnis.berufs-org.de/details/F4-06-A7-F9-E4-39-97-4C-6C-0C-3D-F9-47-5B-5E-5D/?lang=de . Follow each of them . Define datclasses for each of the sections, and a parent dataclass, I'd like field names in English .
Add logging at major steps and before any timeout so I know where we are when I run this.
ok great . Now that i see the output, I want you to follow each of the links and popiulate the objects in a structured manner

There was also a one-character message:

2

That does not appear to carry meaningful task content.

What the back-and-forth tells us

This was not a long strategy conversation. It was a short implementation loop:

  1. Build the browser search script.
  2. Run it on a real postcode.
  3. Follow detail pages.
  4. Convert the results into structured typed objects.
  5. Add logging so the workflow is easier to trust and debug.

That is a good example of where agents shine. The initial prompt got the first version built quickly, and the short follow-ups shaped it into something operationally useful.

A Simple Way to Explain This to a Friend

If I were explaining this casually, I would say:

I used an agent to turn a public professional directory into a postcode-based lead collector. The prompt gave the site, the exact form field, the submit button, and the tool to use. Then I iterated a bit so it did more than scrape a list: it followed each profile and turned the public data into a structured contact record.

And if they ask what is special about the agent part, I would say:

The agent was useful because it could inspect the page, write the automation, run it, notice what the result pages looked like, and then update the code to produce cleaner structured output. It was not "magic lead gen." It was fast execution on a tightly scoped scraping task.

The Main Lesson

The strongest agent prompts for lead generation usually contain four things:

  • a clear source
  • a clear unit of search, like postcode or category
  • concrete page selectors or interaction hints
  • a clear desired output shape

That is exactly what happened here. The prompt was not fancy, but it was specific enough for the agent to do real work.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment