This commit is contained in:
adii1823 2021-10-28 17:40:56 +05:30
parent 3cebc2fe91
commit d4c7ab6c61
11 changed files with 320 additions and 0 deletions

174
ch12/guiscrape.py Normal file
View File

@ -0,0 +1,174 @@
# guiscrape.py
from tkinter import *
from tkinter import ttk, filedialog, messagebox
import base64
import json
from pathlib import Path
from bs4 import BeautifulSoup
import requests
config = {}
def fetch_url():
    """Fetch the page at the URL typed in the entry widget and list its images.

    On network failure the error is shown in the status bar and the
    previously collected images stay cleared.
    """
    url = _url.get()
    config['images'] = []
    _images.set(())  # clear the listbox (empty tuple = no entries)
    try:
        response = requests.get(url)
    except requests.RequestException as err:
        sb(str(err))
        return
    soup = BeautifulSoup(response.content, 'html.parser')
    images = fetch_images(soup, url)
    if images:
        _images.set(tuple(img['name'] for img in images))
        sb('Images found: {}'.format(len(images)))
    else:
        sb('No images found')
    config['images'] = images
def fetch_images(soup, base_url):
    """Return a list of ``{'name': ..., 'url': ...}`` dicts for every <img> in *soup*.

    Fix: the original built URLs with ``f'{base_url}/{src}'``, which only
    worked for relative src paths — an absolute src (http://...) or a
    root-relative one (/img/x.png) produced a mangled URL.  urljoin
    resolves relative srcs against *base_url* and leaves absolute srcs
    untouched, while producing identical results for the plain relative
    paths the example pages use.
    """
    from urllib.parse import urljoin  # stdlib; local so this block stands alone

    images = []
    for img in soup.findAll('img'):
        src = img.get('src')
        # Trailing '/' makes the base act as a directory for relative srcs.
        img_url = urljoin(f'{base_url}/', src)
        name = img_url.split('/')[-1]
        images.append(dict(name=name, url=img_url))
    return images
def save():
    """Save the scraped images using the method chosen in the radio buttons.

    'img' saves image files into a user-picked directory; anything else
    dumps a base64 JSON file at a user-picked path.
    """
    if not config.get('images'):
        alert('No images to save')
        return
    method = _save_method.get()
    if method == 'img':
        save_images(filedialog.askdirectory(mustexist=True))
    else:
        target = filedialog.asksaveasfilename(
            initialfile='images.json',
            filetypes=[('JSON', '.json')])
        save_json(target)
def save_images(dirname):
    """Download every scraped image into *dirname*.

    No-op when the user cancelled the dialog (empty dirname) or when
    nothing has been scraped yet.
    """
    if not (dirname and config.get('images')):
        return
    for img in config['images']:
        content = requests.get(img['url']).content
        (Path(dirname) / img['name']).write_bytes(content)
    alert('Done')
def save_json(filename):
    """Write all scraped images into *filename* as a name -> base64 JSON map.

    No-op when the user cancelled the dialog (empty filename) or when
    nothing has been scraped yet.
    """
    if not (filename and config.get('images')):
        return
    data = {
        img['name']: base64.b64encode(
            requests.get(img['url']).content).decode('utf-8')
        for img in config['images']
    }
    with open(filename, 'w') as ijson:
        json.dump(data, ijson)
    alert('Done')
def sb(msg):
    """Show *msg* in the status bar at the bottom of the window."""
    _status_msg.set(msg)
def alert(msg):
    """Pop up a modal information dialog displaying *msg*."""
    messagebox.showinfo(message=msg)
if __name__ == "__main__":
    # Build the GUI: a main frame with URL entry + fetch button on top,
    # an image listbox with save-method radio buttons in the middle,
    # a Scrape! button, and a sunken status bar at the bottom.
    _root = Tk()
    _root.title('Scrape app')
    _mainframe = ttk.Frame(_root, padding='5 5 5 5')
    _mainframe.grid(row=0, column=0, sticky=(E, W, N, S))
    # --- URL row: labeled frame holding the entry widget and fetch button.
    _url_frame = ttk.LabelFrame(
        _mainframe, text='URL', padding='5 5 5 5')
    _url_frame.grid(row=0, column=0, sticky=(E, W))
    _url_frame.columnconfigure(0, weight=1)
    _url_frame.rowconfigure(0, weight=1)
    # _url backs the entry widget; fetch_url() reads it on button press.
    _url = StringVar()
    _url.set('http://localhost:8000')  # default: the local serve.sh server
    _url_entry = ttk.Entry(
        _url_frame, width=40, textvariable=_url)
    _url_entry.grid(row=0, column=0, sticky=(E, W, S, N), padx=5)
    _fetch_btn = ttk.Button(
        _url_frame, text='Fetch info', command=fetch_url)
    _fetch_btn.grid(row=0, column=1, sticky=W, padx=5)
    # --- Content area: scrollable listbox of scraped image names.
    _img_frame = ttk.LabelFrame(
        _mainframe, text='Content', padding='9 0 0 0')
    _img_frame.grid(row=1, column=0, sticky=(N, S, E, W))
    # _images (a tuple-valued StringVar) drives the listbox contents.
    _images = StringVar()
    _img_listbox = Listbox(
        _img_frame, listvariable=_images, height=6, width=25)
    _img_listbox.grid(row=0, column=0, sticky=(E, W), pady=5)
    _scrollbar = ttk.Scrollbar(
        _img_frame, orient=VERTICAL, command=_img_listbox.yview)
    _scrollbar.grid(row=0, column=1, sticky=(S, N), pady=6)
    # Wire listbox and scrollbar to each other.
    _img_listbox.configure(yscrollcommand=_scrollbar.set)
    # --- Radio buttons choosing the save format (read by save()).
    _radio_frame = ttk.Frame(_img_frame)
    _radio_frame.grid(row=0, column=2, sticky=(N, S, W, E))
    _choice_lbl = ttk.Label(
        _radio_frame, text="Choose how to save images")
    _choice_lbl.grid(row=0, column=0, padx=5, pady=5)
    # _save_method holds 'img' (image files) or 'json' (base64 JSON dump).
    _save_method = StringVar()
    _save_method.set('img')
    _img_only_radio = ttk.Radiobutton(
        _radio_frame, text='As Images', variable=_save_method,
        value='img')
    _img_only_radio.grid(
        row=1, column=0, padx=5, pady=2, sticky=W)
    _img_only_radio.configure(state='normal')
    _json_radio = ttk.Radiobutton(
        _radio_frame, text='As JSON', variable=_save_method,
        value='json')
    _json_radio.grid(row=2, column=0, padx=5, pady=2, sticky=W)
    _scrape_btn = ttk.Button(
        _mainframe, text='Scrape!', command=save)
    _scrape_btn.grid(row=2, column=0, sticky=E, pady=5)
    # --- Status bar: sunken frame with a label driven by sb().
    _status_frame = ttk.Frame(
        _root, relief='sunken', padding='2 2 2 2')
    _status_frame.grid(row=1, column=0, sticky=(E, W, S))
    _status_msg = StringVar()
    _status_msg.set('Type a URL to start scraping...')
    _status = ttk.Label(
        _status_frame, textvariable=_status_msg, anchor=W)
    _status.grid(row=0, column=0, sticky=(E, W))
    # Blocks until the window is closed.
    _root.mainloop()
"""
Example on reading a JSON file:
with open('images.json', 'r') as f:
    data = json.loads(f.read())
    for (name, b64val) in data.items():
        with open(name, 'wb') as f:
            f.write(base64.b64decode(b64val))
"""

View File

@ -0,0 +1,2 @@
beautifulsoup4
requests

View File

@ -0,0 +1,20 @@
#
# This file is autogenerated by pip-compile with python 3.9
# To update, run:
#
# pip-compile requirements.in
#
beautifulsoup4==4.9.3
# via -r requirements.in
certifi==2021.5.30
# via requests
charset-normalizer==2.0.3
# via requests
idna==3.2
# via requests
requests==2.26.0
# via -r requirements.in
soupsieve==2.2.1
# via beautifulsoup4
urllib3==1.26.6
# via requests

105
ch12/scrape.py Normal file
View File

@ -0,0 +1,105 @@
# scrape.py
import argparse
import base64
import json
from pathlib import Path
from bs4 import BeautifulSoup
import requests
def scrape(url, format_, type_):
    """Fetch *url*, collect its images filtered by *type_*, save as *format_*.

    Network errors are printed and abort the run.
    """
    try:
        response = requests.get(url)
    except requests.RequestException as err:
        print(str(err))
        return
    soup = BeautifulSoup(response.content, 'html.parser')
    matching = filter_images(fetch_images(soup, url), type_)
    save(matching, format_)
def fetch_images(soup, base_url):
    """Return a list of ``{'name': ..., 'url': ...}`` dicts for every <img> in *soup*.

    Fix: the original noted it "works only with relative src paths" —
    ``f'{base_url}/{src}'`` mangles absolute src URLs and doubles the
    slash for root-relative ones.  urljoin resolves relative srcs
    against *base_url*, keeps absolute srcs untouched, and produces
    identical results for the plain relative paths the example page uses.
    """
    from urllib.parse import urljoin  # stdlib; local so this block stands alone

    images = []
    for img in soup.findAll('img'):
        src = img.get('src')
        # Trailing '/' makes the base act as a directory for relative srcs.
        img_url = urljoin(f'{base_url}/', src)
        name = img_url.split('/')[-1]
        images.append(dict(name=name, url=img_url))
    return images
def filter_images(images, type_):
    """Return the subset of *images* whose filename extension matches *type_*.

    *type_* is 'all' (no filtering), 'png', or 'jpg' (which also accepts
    .jpeg).  Any other value raises KeyError; the CLI's argparse choices
    prevent that in normal use.
    """
    if type_ == 'all':
        return images
    ext_map = {
        'png': ['.png'],
        'jpg': ['.jpg', '.jpeg'],
    }
    # Hoist the invariant mapping lookup out of the per-image loop
    # (the original re-evaluated ext_map[type_] for every image).
    extensions = ext_map[type_]
    return [
        img for img in images
        if matches_extension(img['name'], extensions)
    ]
def matches_extension(filename, extension_list):
    """Return True when *filename*'s suffix (case-insensitive) is in *extension_list*."""
    return Path(filename.lower()).suffix in extension_list
def save(images, format_):
    """Persist *images* as files ('img') or a JSON dump (any other format).

    Prints 'Done' on success, or a notice when there is nothing to save.
    """
    if not images:
        print('No images to save.')
        return
    saver = save_images if format_ == 'img' else save_json
    saver(images)
    print('Done')
def save_images(images):
    """Download each image into the current directory, named by its basename."""
    for img in images:
        content = requests.get(img['url']).content
        with open(img['name'], 'wb') as out:
            out.write(content)
def save_json(images):
    """Write *images* to 'images.json' as a name -> base64-encoded-content map."""
    data = {
        img['name']: base64.b64encode(
            requests.get(img['url']).content).decode('utf-8')
        for img in images
    }
    with open('images.json', 'w') as ijson:
        json.dump(data, ijson)
if __name__ == "__main__":
    # CLI entry point: scrape.py [-t all|png|jpg] [-f img|json] URL
    cli = argparse.ArgumentParser(
        description='Scrape a webpage.')
    cli.add_argument(
        '-t', '--type', choices=['all', 'png', 'jpg'], default='all',
        help='The image type we want to scrape.')
    cli.add_argument(
        '-f', '--format', choices=['img', 'json'], default='img',
        help='The format images are saved to.')
    cli.add_argument(
        'url',
        help='The URL we want to scrape for images.')
    namespace = cli.parse_args()
    scrape(namespace.url, namespace.format, namespace.type)

Binary file not shown.

After

Width:  |  Height:  |  Size: 32 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 25 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 37 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 58 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 71 KiB

View File

@ -0,0 +1,15 @@
<!DOCTYPE html>
<!-- Static test page served by serve.sh on localhost:8000.
     The scraper examples look for the five <img> tags below
     (relative srcs under img/: three .png, one .jpg, one .jpeg). -->
<html lang="en">
<head><title>Cool Owls!</title></head>
<body>
<h1>Welcome to our owl gallery</h1>
<div>
<img src="img/owl-alcohol.png" height="128" />
<img src="img/owl-book.png" height="128" />
<img src="img/owl-books.png" height="128" />
<img src="img/owl-ebook.jpg" height="128" />
<img src="img/owl-rose.jpeg" height="128" />
</div>
<p>Do you like these owls?</p>
</body>
</html>

View File

@ -0,0 +1,4 @@
#!/bin/sh
# Serve the current directory (index.html and the img/ folder) over HTTP
# on port 8000, giving the scraper examples a local page to fetch.
python -m http.server 8000