ch12
This commit is contained in:
parent
3cebc2fe91
commit
d4c7ab6c61
174
ch12/guiscrape.py
Normal file
174
ch12/guiscrape.py
Normal file
@ -0,0 +1,174 @@
|
|||||||
|
# guiscrape.py
import base64
import json
from pathlib import Path
from tkinter import *
from tkinter import ttk, filedialog, messagebox
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import requests
|
||||||
|
|
||||||
|
|
||||||
|
config = {}
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_url():
    """Callback for the 'Fetch info' button.

    Downloads the page at the URL currently in the entry widget,
    scrapes it for images, fills the listbox with the image names and
    stores the records in config['images'] for the save callbacks.
    Network errors are reported in the status bar instead of raising.
    """
    url = _url.get()
    # Reset previous results before attempting a new fetch.
    config['images'] = []
    _images.set(())  # initialised as an empty tuple
    try:
        # A timeout prevents an unresponsive server from freezing the
        # Tk event loop indefinitely.
        page = requests.get(url, timeout=10)
    except requests.RequestException as err:
        sb(str(err))
    else:
        soup = BeautifulSoup(page.content, 'html.parser')
        images = fetch_images(soup, url)
        if images:
            _images.set(tuple(img['name'] for img in images))
            sb('Images found: {}'.format(len(images)))
        else:
            sb('No images found')
        config['images'] = images
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_images(soup, base_url):
    """Extract image records from a parsed page.

    Args:
        soup: a BeautifulSoup document (anything exposing
            findAll('img') returning tags with .get('src')).
        base_url: URL the page was fetched from; relative src values
            are resolved against it.

    Returns:
        A list of dicts with 'name' (file name) and 'url'
        (resolved URL).
    """
    images = []
    for img in soup.findAll('img'):
        src = img.get('src')
        if not src:
            # An <img> without a src would otherwise yield a bogus
            # '.../None' record — skip it.
            continue
        # urljoin correctly handles absolute src values and base URLs
        # that carry a path, unlike naive f'{base_url}/{src}' joining.
        img_url = urljoin(base_url + '/', src)
        name = img_url.split('/')[-1]
        images.append(dict(name=name, url=img_url))
    return images
|
||||||
|
|
||||||
|
|
||||||
|
def save():
    """Callback for the 'Scrape!' button.

    Saves the fetched images either as individual files or as one JSON
    document, depending on the selected radio button. Shows an alert
    and bails out if nothing has been fetched yet.
    """
    if not config.get('images'):
        alert('No images to save')
        return

    if _save_method.get() == 'img':
        # askdirectory returns '' when the dialog is cancelled;
        # save_images guards against a falsy dirname.
        dirname = filedialog.askdirectory(mustexist=True)
        save_images(dirname)
    else:
        # asksaveasfilename likewise returns '' on cancel;
        # save_json guards against a falsy filename.
        filename = filedialog.asksaveasfilename(
            initialfile='images.json',
            filetypes=[('JSON', '.json')])
        save_json(filename)
|
||||||
|
|
||||||
|
|
||||||
|
def save_images(dirname):
    """Download every fetched image into directory *dirname*.

    Does nothing when *dirname* is falsy (dialog cancelled) or when no
    images have been fetched yet.
    """
    if dirname and config.get('images'):
        for img in config['images']:
            # Timeout so one stalled download cannot hang the GUI.
            img_data = requests.get(img['url'], timeout=10).content
            filename = Path(dirname).joinpath(img['name'])
            # Path.write_bytes replaces the manual open/write pair.
            filename.write_bytes(img_data)
        alert('Done')
|
||||||
|
|
||||||
|
|
||||||
|
def save_json(filename):
    """Download every fetched image and write them all into *filename*
    as a JSON object mapping image name -> base64-encoded payload.

    Does nothing when *filename* is falsy (dialog cancelled) or when
    no images have been fetched yet.
    """
    if filename and config.get('images'):
        data = {}
        for img in config['images']:
            # Timeout so one stalled download cannot hang the GUI.
            img_data = requests.get(img['url'], timeout=10).content
            b64_img_data = base64.b64encode(img_data)
            str_img_data = b64_img_data.decode('utf-8')
            data[img['name']] = str_img_data

        with open(filename, 'w') as ijson:
            ijson.write(json.dumps(data))
        alert('Done')
|
||||||
|
|
||||||
|
|
||||||
|
def sb(msg):
    """Show *msg* in the status bar at the bottom of the window."""
    _status_msg.set(msg)
|
||||||
|
|
||||||
|
|
||||||
|
def alert(msg):
    """Pop up a modal information dialog displaying *msg*."""
    messagebox.showinfo(message=msg)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":

    # Root window and main container frame.
    _root = Tk()
    _root.title('Scrape app')

    _mainframe = ttk.Frame(_root, padding='5 5 5 5')
    _mainframe.grid(row=0, column=0, sticky=(E, W, N, S))

    # --- URL input row: entry widget plus the Fetch button. ---
    _url_frame = ttk.LabelFrame(
        _mainframe, text='URL', padding='5 5 5 5')
    _url_frame.grid(row=0, column=0, sticky=(E, W))
    _url_frame.columnconfigure(0, weight=1)
    _url_frame.rowconfigure(0, weight=1)

    _url = StringVar()
    # Default points at the local test server (see simple_server/).
    _url.set('http://localhost:8000')
    _url_entry = ttk.Entry(
        _url_frame, width=40, textvariable=_url)
    _url_entry.grid(row=0, column=0, sticky=(E, W, S, N), padx=5)

    _fetch_btn = ttk.Button(
        _url_frame, text='Fetch info', command=fetch_url)
    _fetch_btn.grid(row=0, column=1, sticky=W, padx=5)

    # --- Results area: listbox of scraped image names + scrollbar. ---
    _img_frame = ttk.LabelFrame(
        _mainframe, text='Content', padding='9 0 0 0')
    _img_frame.grid(row=1, column=0, sticky=(N, S, E, W))

    # _images backs the listbox contents; fetch_url() sets it.
    _images = StringVar()
    _img_listbox = Listbox(
        _img_frame, listvariable=_images, height=6, width=25)
    _img_listbox.grid(row=0, column=0, sticky=(E, W), pady=5)
    _scrollbar = ttk.Scrollbar(
        _img_frame, orient=VERTICAL, command=_img_listbox.yview)
    _scrollbar.grid(row=0, column=1, sticky=(S, N), pady=6)
    # Wire the listbox and scrollbar to each other.
    _img_listbox.configure(yscrollcommand=_scrollbar.set)

    # --- Save-method radio buttons (files vs JSON). ---
    _radio_frame = ttk.Frame(_img_frame)
    _radio_frame.grid(row=0, column=2, sticky=(N, S, W, E))

    _choice_lbl = ttk.Label(
        _radio_frame, text="Choose how to save images")
    _choice_lbl.grid(row=0, column=0, padx=5, pady=5)

    # _save_method is read by save(); 'img' or 'json'.
    _save_method = StringVar()
    _save_method.set('img')
    _img_only_radio = ttk.Radiobutton(
        _radio_frame, text='As Images', variable=_save_method,
        value='img')
    _img_only_radio.grid(
        row=1, column=0, padx=5, pady=2, sticky=W)
    _img_only_radio.configure(state='normal')
    _json_radio = ttk.Radiobutton(
        _radio_frame, text='As JSON', variable=_save_method,
        value='json')
    _json_radio.grid(row=2, column=0, padx=5, pady=2, sticky=W)

    _scrape_btn = ttk.Button(
        _mainframe, text='Scrape!', command=save)
    _scrape_btn.grid(row=2, column=0, sticky=E, pady=5)

    # --- Status bar: sunken frame with a message label; sb() updates it. ---
    _status_frame = ttk.Frame(
        _root, relief='sunken', padding='2 2 2 2')
    _status_frame.grid(row=1, column=0, sticky=(E, W, S))

    _status_msg = StringVar()
    _status_msg.set('Type a URL to start scraping...')
    _status = ttk.Label(
        _status_frame, textvariable=_status_msg, anchor=W)
    _status.grid(row=0, column=0, sticky=(E, W))

    # Hand control to the Tk event loop; blocks until the window closes.
    _root.mainloop()
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Example on reading a JSON file:
|
||||||
|
|
||||||
|
with open('images.json', 'r') as f:
|
||||||
|
data = json.loads(f.read())
|
||||||
|
|
||||||
|
for (name, b64val) in data.items():
|
||||||
|
with open(name, 'wb') as f:
|
||||||
|
f.write(base64.b64decode(b64val))
|
||||||
|
"""
|
||||||
2
ch12/requirements/requirements.in
Normal file
2
ch12/requirements/requirements.in
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
beautifulsoup4
|
||||||
|
requests
|
||||||
20
ch12/requirements/requirements.txt
Normal file
20
ch12/requirements/requirements.txt
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
#
|
||||||
|
# This file is autogenerated by pip-compile with python 3.9
|
||||||
|
# To update, run:
|
||||||
|
#
|
||||||
|
# pip-compile requirements.in
|
||||||
|
#
|
||||||
|
beautifulsoup4==4.9.3
|
||||||
|
# via -r requirements.in
|
||||||
|
certifi==2021.5.30
|
||||||
|
# via requests
|
||||||
|
charset-normalizer==2.0.3
|
||||||
|
# via requests
|
||||||
|
idna==3.2
|
||||||
|
# via requests
|
||||||
|
requests==2.26.0
|
||||||
|
# via -r requirements.in
|
||||||
|
soupsieve==2.2.1
|
||||||
|
# via beautifulsoup4
|
||||||
|
urllib3==1.26.6
|
||||||
|
# via requests
|
||||||
105
ch12/scrape.py
Normal file
105
ch12/scrape.py
Normal file
@ -0,0 +1,105 @@
|
|||||||
|
# scrape.py
|
||||||
|
import argparse
|
||||||
|
import base64
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import requests
|
||||||
|
|
||||||
|
|
||||||
|
def scrape(url, format_, type_):
    """Fetch *url*, collect its images filtered by *type_*, and save
    them in *format_* ('img' for individual files, 'json' for one
    base64 document).

    Network errors are printed instead of raising.
    """
    try:
        # A timeout keeps the script from hanging forever on a dead
        # or unresponsive server.
        page = requests.get(url, timeout=10)
    except requests.RequestException as err:
        print(str(err))
    else:
        soup = BeautifulSoup(page.content, 'html.parser')
        images = fetch_images(soup, url)
        images = filter_images(images, type_)
        save(images, format_)
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_images(soup, base_url):
    """Collect a name/url record for every <img> tag in *soup*.

    Note: URLs are built by simple concatenation, so this works only
    with relative src paths.
    """
    records = []
    for tag in soup.findAll('img'):
        full_url = '{}/{}'.format(base_url, tag.get('src'))
        records.append({'name': full_url.split('/')[-1], 'url': full_url})
    return records
|
||||||
|
|
||||||
|
|
||||||
|
def filter_images(images, type_):
    """Return the subset of *images* whose file extension matches
    *type_*: 'all' keeps everything, 'png'/'jpg' keep those formats.
    """
    if type_ == 'all':
        return images
    # Map each CLI choice to its accepted file suffixes.
    allowed = {
        'png': ['.png'],
        'jpg': ['.jpg', '.jpeg'],
    }[type_]
    return [image for image in images
            if matches_extension(image['name'], allowed)]


def matches_extension(filename, extension_list):
    """True if *filename*'s lower-cased suffix is in *extension_list*."""
    return Path(filename.lower()).suffix in extension_list
|
||||||
|
|
||||||
|
|
||||||
|
def save(images, format_):
    """Dispatch saving of *images* to the right writer for *format_*
    ('img' -> files, anything else -> JSON) and report the outcome.
    """
    # Guard clause: nothing fetched means nothing to do.
    if not images:
        print('No images to save.')
        return
    saver = save_images if format_ == 'img' else save_json
    saver(images)
    print('Done')
|
||||||
|
|
||||||
|
|
||||||
|
def save_images(images):
    """Download each image record and write it to the current working
    directory under its own name.
    """
    for img in images:
        # Timeout so a single stalled download cannot hang the run.
        img_data = requests.get(img['url'], timeout=10).content
        with open(img['name'], 'wb') as f:
            f.write(img_data)
|
||||||
|
|
||||||
|
|
||||||
|
def save_json(images):
    """Download each image record and write them all into 'images.json'
    in the current directory, as a JSON object mapping image name ->
    base64-encoded payload.
    """
    data = {}
    for img in images:
        # Timeout so a single stalled download cannot hang the run.
        img_data = requests.get(img['url'], timeout=10).content
        b64_img_data = base64.b64encode(img_data)
        str_img_data = b64_img_data.decode('utf-8')
        data[img['name']] = str_img_data

    with open('images.json', 'w') as ijson:
        ijson.write(json.dumps(data))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":

    # Command-line interface: the URL is positional; image type and
    # output format are optional flags with safe defaults.
    parser = argparse.ArgumentParser(
        description='Scrape a webpage.')
    parser.add_argument(
        '-t',
        '--type',
        choices=['all', 'png', 'jpg'],
        default='all',
        help='The image type we want to scrape.')

    parser.add_argument(
        '-f',
        '--format',
        choices=['img', 'json'],
        default='img',
        help='The format images are saved to.')

    parser.add_argument(
        'url',
        help='The URL we want to scrape for images.')

    args = parser.parse_args()
    scrape(args.url, args.format, args.type)
|
||||||
BIN
ch12/simple_server/img/owl-alcohol.png
Normal file
BIN
ch12/simple_server/img/owl-alcohol.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 32 KiB |
BIN
ch12/simple_server/img/owl-book.png
Normal file
BIN
ch12/simple_server/img/owl-book.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 25 KiB |
BIN
ch12/simple_server/img/owl-books.png
Normal file
BIN
ch12/simple_server/img/owl-books.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 37 KiB |
BIN
ch12/simple_server/img/owl-ebook.jpg
Normal file
BIN
ch12/simple_server/img/owl-ebook.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 58 KiB |
BIN
ch12/simple_server/img/owl-rose.jpeg
Normal file
BIN
ch12/simple_server/img/owl-rose.jpeg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 71 KiB |
15
ch12/simple_server/index.html
Normal file
15
ch12/simple_server/index.html
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head><title>Cool Owls!</title></head>
|
||||||
|
<body>
|
||||||
|
<h1>Welcome to our owl gallery</h1>
|
||||||
|
<div>
|
||||||
|
<img src="img/owl-alcohol.png" height="128" />
|
||||||
|
<img src="img/owl-book.png" height="128" />
|
||||||
|
<img src="img/owl-books.png" height="128" />
|
||||||
|
<img src="img/owl-ebook.jpg" height="128" />
|
||||||
|
<img src="img/owl-rose.jpeg" height="128" />
|
||||||
|
</div>
|
||||||
|
<p>Do you like these owls?</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
4
ch12/simple_server/serve.sh
Normal file
4
ch12/simple_server/serve.sh
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
#!/bin/sh

# Start a simple HTTP server on port 8000, serving the current
# directory (index.html plus the img/ folder) as the scraping target
# for guiscrape.py / scrape.py.
python -m http.server 8000
||||||
Loading…
x
Reference in New Issue
Block a user