joaoescribano · January 21, 2019 10:39 · Jan 21, 2019
diff --git a/urlcapture.py b/urlcapture.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+
+import datetime
+import math
+import os
+import sys
+import tempfile
+
+# third-party imports
+from PIL import Image
+from selenium import webdriver
+from time import sleep
+
+def get_chrome_drive(driver_path=None):
+    base_dir = os.path.dirname( os.path.abspath(__file__) )
+    log_path = os.path.join( base_dir, 'chromedriver.log' )
+
+    if driver_path is None:
+        driver_path = '/usr/bin/chromedriver'
+        pass
+
+    options = webdriver.ChromeOptions()
+    options.headless = True
+    options.add_argument('--hide-scrollbars')
+    options.add_argument('--no-sandbox')
+
+    driver = webdriver.Chrome(
+        executable_path=driver_path,
+        chrome_options=options,
+        service_args=[
+            # '--log-path={}'.format(log_path),
+            # '--verbose',
+        ]
+    )
+
+    return driver
+
+def get_firefox_drive(driver_path=None):
+    base_dir = os.path.dirname( os.path.abspath(__file__) )
+    log_path = os.path.join( base_dir, 'geckodriver.log' )
+
+    if driver_path is None:
+        driver_path = '/usr/bin/geckodriver'
+        pass
+
+    options = webdriver.FirefoxOptions()
+    options.add_argument('-headless')
+
+    driver = webdriver.Firefox(
+        executable_path=driver_path,
+        firefox_options=options
+    )
+
+    return driver
+
+def save_fullpage_screenshot(driver, url, output_path, tmp_prefix='selenium_screenshot', tmp_suffix='.png'):
+    """
+    Creates a full page screenshot using a selenium driver by scrolling and taking multiple screenshots,
+    and stitching them into a single image.
+    """
+
+    # get the page
+    driver.get(url)
+
+    # get dimensions
+    window_height = driver.execute_script('return window.innerHeight')
+    scroll_height = driver.execute_script('return document.body.parentNode.scrollHeight')
+
+    num = int( math.ceil( float(scroll_height) / float(window_height) ) )
+
+    # get temp files
+    tempfiles = []
+    for i in range( num ):
+        fd,path = tempfile.mkstemp(prefix='{0}-{1:02}-'.format(tmp_prefix, i+1), suffix=tmp_suffix)
+        os.close(fd)
+        tempfiles.append(path)
+        pass
+    tempfiles_len = len(tempfiles)
+
+    try:
+        # take screenshots
+        for i,path in enumerate(tempfiles):
+            if i > 0:
+                driver.execute_script( 'window.scrollBy(%d,%d)' % (0, window_height) )
+            driver.save_screenshot(path)
+            pass
+
+        # stitch images together
+        stiched = None
+        for i,path in enumerate(tempfiles):
+            img = Image.open(path)
+
+            w, h = img.size
+            y = i * window_height
+
+            if i == ( tempfiles_len - 1 ) and num > 1:
+                img = img.crop((
+                    0,
+                    h-(scroll_height % h),
+                    w,
+                    h
+                ))
+
+                w, h = img.size
+                pass
+
+            if stiched is None:
+                stiched = Image.new('RGB', (w, scroll_height))
+
+            stiched.paste(img, (
+                0, # x0
+                y, # y0
+                w, # x1
+                y + h # y1
+            ))
+            pass
+        stiched.save(output_path)
+    finally:
+        # cleanup
+        for path in tempfiles:
+            if os.path.isfile(path):
+                os.remove(path)
+        pass
+
+    return output_path
+
+
+def main():
+    url = sys.argv[1]
+    filename = sys.argv[2]
+
+    driver = get_chrome_drive()
+    driver.set_window_size(1280,768)
+
+    save_fullpage_screenshot(driver, url, filename)
+    driver.quit()
+
+    return
+
+if __name__ == '__main__':
+    main()
No results found