Skip to content

Instantly share code, notes, and snippets.

@uratmangun
Created December 28, 2024 14:19
Show Gist options
  • Select an option

  • Save uratmangun/55401b8699ed371b3ccef4a6f143d319 to your computer and use it in GitHub Desktop.

Select an option

Save uratmangun/55401b8699ed371b3ccef4a6f143d319 to your computer and use it in GitHub Desktop.

Revisions

  1. uratmangun.ovh created this gist Dec 28, 2024.
    182 changes: 182 additions & 0 deletions scrape-or-search.mjs
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,182 @@
    import { chromium } from 'playwright';
    import { fileURLToPath } from 'url';
    import dotenv from 'dotenv';
    import { searchParser } from './convert-search.mjs';

    dotenv.config({ path: '.env.local' });

    /**
    * Scrapes content from a given URL using Playwright with CDP connection
    * @param {string} url - The URL to scrape
    * @param {string} format - The output format ('html' or 'text')
    * @returns {Promise<Object>} - The scraped content
    */
    async function scrapeUrl(url, format = 'html') {
    if (!process.env.BRIGHT_PLAYWRIGHT_URL) {
    throw new Error('BRIGHT_PLAYWRIGHT_URL environment variable is not set');
    }

    const browser = await chromium.connectOverCDP(process.env.BRIGHT_PLAYWRIGHT_URL);

    try {
    const context = await browser.newContext();
    const page = await context.newPage();

    await page.goto(url, { timeout: 60000 });
    await page.waitForLoadState('networkidle');

    if (format === 'text') {
    const text = await page.textContent('body');
    return { text };
    } else {
    const html = await page.content();
    return { html };
    }
    } catch (error) {
    console.error('Error during scraping:', error);
    throw error;
    } finally {
    await browser.close();
    }
    }

    /**
    * Gets Google search results HTML
    * @param {string} query - Search query
    * @param {string} baseUrl - Base URL for Google search
    * @returns {Promise<string>} - HTML content
    */
    async function getGoogleHtml(query = '', baseUrl = 'https://www.google.com/search?q=') {
    const browser = await chromium.connectOverCDP(process.env.BRIGHT_PLAYWRIGHT_URL);
    const searchUrl = baseUrl + encodeURIComponent(query);

    try {
    const context = await browser.newContext();
    const page = await context.newPage();

    await page.goto(searchUrl);

    const bodyContent = await page.evaluate(() => {
    const body = document.body;
    if (!body) return '';

    // Remove all script tags
    const scripts = body.getElementsByTagName('script');
    while (scripts.length > 0) {
    scripts[0].parentNode.removeChild(scripts[0]);
    }

    // Remove all style tags
    const styles = body.getElementsByTagName('style');
    while (styles.length > 0) {
    styles[0].parentNode.removeChild(styles[0]);
    }

    // Remove all noscript tags
    const noscripts = body.getElementsByTagName('noscript');
    while (noscripts.length > 0) {
    noscripts[0].parentNode.removeChild(noscripts[0]);
    }

    return body.innerHTML;
    });

    return bodyContent;
    } catch (error) {
    console.error('Error fetching page:', error);
    throw error;
    } finally {
    await browser.close();
    }
    }

    /**
    * Shows the content of a given URL in specified format
    * @param {string} format - The output format ('html' or 'text')
    * @param {string} url - The URL to show content from
    * @returns {Promise<void>}
    */
    async function showContent(format, url) {
    try {
    const result = await scrapeUrl(url, format);
    console.log(result[format]);
    } catch (error) {
    console.error('Error showing content:', error);
    throw error;
    }
    }

    /**
    * Performs a Google search and shows the results
    * @param {string} query - Search query
    * @returns {Promise<void>}
    */
    async function search(query) {
    try {
    console.log('Getting google html');
    const html = await getGoogleHtml(query);
    console.log("parsing html");
    const parsed = await searchParser(html);
    console.log(parsed.choices[0].message.content);
    } catch (error) {
    console.error('Search error:', error);
    throw error;
    }
    }

    // Main function to handle command line usage
    async function main() {
    const [command, ...args] = process.argv.slice(2);

    if (!command) {
    console.error('Please provide a command.');
    console.error('Usage:');
    console.error(' pnpm run tool scrape <html|text> <url>');
    console.error(' pnpm run tool search <query>');
    process.exit(1);
    }

    try {
    switch (command.toLowerCase()) {
    case 'scrape':
    if (args.length < 2) {
    console.error('Please provide the format and URL to scrape.');
    console.error('Usage: pnpm run tool scrape <html|text> <url>');
    process.exit(1);
    }
    const format = args[0].toLowerCase();
    if (format !== 'html' && format !== 'text') {
    console.error('Invalid format. Use either "html" or "text".');
    process.exit(1);
    }
    await showContent(format, args[1]);
    break;

    case 'search':
    if (args.length < 1) {
    console.error('Please provide a search query.');
    console.error('Usage: pnpm run tool search <query>');
    process.exit(1);
    }
    await search(args.join(' '));
    break;

    default:
    console.error('Invalid command. Use either "scrape" or "search".');
    console.error('Usage:');
    console.error(' pnpm run tool scrape <html|text> <url>');
    console.error(' pnpm run tool search <query>');
    process.exit(1);
    }
    } catch (error) {
    console.error('Operation failed:', error);
    process.exit(1);
    }
    }

    // Run main function if this file is run directly
    if (process.argv[1] === fileURLToPath(import.meta.url)) {
    main();
    }

    export { scrapeUrl, getGoogleHtml };