Skip to content

Instantly share code, notes, and snippets.

@joaoescribano
Last active January 21, 2019 10:39
Show Gist options
  • Select an option

  • Save joaoescribano/b9c34e7a07fb450ae7747e582dd4e6ba to your computer and use it in GitHub Desktop.

Select an option

Save joaoescribano/b9c34e7a07fb450ae7747e582dd4e6ba to your computer and use it in GitHub Desktop.

Revisions

  1. joaoescribano created this gist Jan 21, 2019.
    141 changes: 141 additions & 0 deletions urlcapture.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,141 @@
    #!/usr/bin/env python3

    import datetime
    import math
    import os
    import sys
    import tempfile

    # third-party imports
    from PIL import Image
    from selenium import webdriver
    from time import sleep

    def get_chrome_drive(driver_path=None):
        """Create a headless Chrome WebDriver.

        Args:
            driver_path: Path to the chromedriver executable. Falls back to
                '/usr/bin/chromedriver' when None.

        Returns:
            The ``webdriver.Chrome`` instance, started with the headless
            option set and the '--hide-scrollbars' and '--no-sandbox' flags.
        """
        if driver_path is None:
            driver_path = '/usr/bin/chromedriver'

        options = webdriver.ChromeOptions()
        options.headless = True
        options.add_argument('--hide-scrollbars')
        options.add_argument('--no-sandbox')

        # `options=` is the supported keyword; `chrome_options=` is deprecated.
        # No extra chromedriver service arguments are passed.
        return webdriver.Chrome(
            executable_path=driver_path,
            options=options,
        )

    def get_firefox_drive(driver_path=None):
        """Create a headless Firefox WebDriver.

        Args:
            driver_path: Path to the geckodriver executable. Falls back to
                '/usr/bin/geckodriver' when None.

        Returns:
            The ``webdriver.Firefox`` instance, started with the '-headless'
            argument.
        """
        if driver_path is None:
            driver_path = '/usr/bin/geckodriver'

        options = webdriver.FirefoxOptions()
        options.add_argument('-headless')

        # `options=` is the supported keyword; `firefox_options=` is deprecated.
        return webdriver.Firefox(
            executable_path=driver_path,
            options=options,
        )

    def save_fullpage_screenshot(driver, url, output_path, tmp_prefix='selenium_screenshot', tmp_suffix='.png'):
        """Save a full-page screenshot of ``url`` to ``output_path``.

        The page is loaded in ``driver`` and scrolled one viewport at a time.
        Each viewport is captured to a temporary file, and the captures are
        then stitched top-to-bottom into a single image. The temporary files
        are removed before returning, whether or not the capture succeeded.

        Args:
            driver: Selenium WebDriver used to load, scroll and capture the page.
            url: Address of the page to capture.
            output_path: Destination file for the stitched image.
            tmp_prefix: Filename prefix for the temporary per-viewport captures.
            tmp_suffix: Filename suffix (extension) for the temporary captures.

        Returns:
            ``output_path``.
        """
        driver.get(url)

        # Viewport height and total document height, as reported by the page.
        window_height = driver.execute_script('return window.innerHeight')
        scroll_height = driver.execute_script('return document.body.parentNode.scrollHeight')

        # Number of viewport-sized captures needed to cover the whole page.
        num = int(math.ceil(float(scroll_height) / float(window_height)))

        tempfiles = []
        try:
            # Reserve the temp files inside the try block so that a failure
            # part-way through still removes the ones already created.
            for i in range(num):
                fd, path = tempfile.mkstemp(
                    prefix='{0}-{1:02}-'.format(tmp_prefix, i + 1),
                    suffix=tmp_suffix,
                )
                os.close(fd)
                tempfiles.append(path)

            # Capture each viewport, scrolling down one window between shots.
            for i, path in enumerate(tempfiles):
                if i > 0:
                    driver.execute_script('window.scrollBy(%d,%d)' % (0, window_height))
                driver.save_screenshot(path)

            # Stitch the captures together.
            stitched = None
            last_index = len(tempfiles) - 1
            for i, path in enumerate(tempfiles):
                # The context manager closes the file handle so the temp file
                # can always be deleted afterwards.
                with Image.open(path) as shot:
                    w, h = shot.size
                    y = i * window_height
                    tile = shot

                    # The final capture is cropped to its bottom rows, i.e.
                    # the part of the page not covered by earlier captures.
                    # When the page height is an exact multiple of the
                    # capture height the remainder is 0 and the whole capture
                    # is kept; cropping then would yield an empty image and
                    # leave the last section of the output blank.
                    remainder = scroll_height % h
                    if i == last_index and num > 1 and remainder:
                        tile = shot.crop((0, h - remainder, w, h))

                    if stitched is None:
                        stitched = Image.new('RGB', (w, scroll_height))

                    stitched.paste(tile, (0, y))
            stitched.save(output_path)
        finally:
            # Clean up the temporary captures.
            for path in tempfiles:
                if os.path.isfile(path):
                    os.remove(path)

        return output_path


    def main():
        """Command-line entry point: capture a page to an image file.

        Reads the page URL from ``sys.argv[1]`` and the output filename from
        ``sys.argv[2]``; any further arguments are ignored. Exits with a usage
        message when either is missing.
        """
        if len(sys.argv) < 3:
            sys.exit('usage: {0} <url> <output-file>'.format(sys.argv[0]))

        url = sys.argv[1]
        filename = sys.argv[2]

        driver = get_chrome_drive()
        try:
            driver.set_window_size(1280, 768)
            save_fullpage_screenshot(driver, url, filename)
        finally:
            # Always shut the browser down, even if the capture failed.
            driver.quit()

    # Run the command-line entry point only when this file is executed as a
    # script, not when it is imported as a module.
    if __name__ == '__main__':
        main()