ch12
This commit is contained in:
parent
3cebc2fe91
commit
d4c7ab6c61
174
ch12/guiscrape.py
Normal file
174
ch12/guiscrape.py
Normal file
@ -0,0 +1,174 @@
|
||||
# guiscrape.py
# GUI front-end for the image scraper: fetches a page, lists the images
# found on it, and saves them either as files or as a base64 JSON bundle.
from tkinter import *
from tkinter import ttk, filedialog, messagebox
import base64
import json
from pathlib import Path

from bs4 import BeautifulSoup
import requests


# Shared mutable state between GUI callbacks.
# config['images'] holds the list of scraped image dicts (name/url).
config = {}
|
||||
|
||||
|
||||
def fetch_url():
    """Fetch the page at the URL typed by the user and list its images.

    Resets any previous results, downloads the page, parses it for
    <img> tags, then updates the listbox and the status bar. Network
    failures are reported in the status bar instead of raising.
    """
    url = _url.get()
    # Drop previous results before fetching new ones.
    config['images'] = []
    _images.set(())  # initialised as an empty tuple

    try:
        page = requests.get(url)
    except requests.RequestException as err:
        sb(str(err))
        return

    soup = BeautifulSoup(page.content, 'html.parser')
    images = fetch_images(soup, url)
    if not images:
        sb('No images found')
    else:
        _images.set(tuple(img['name'] for img in images))
        sb('Images found: {}'.format(len(images)))
    config['images'] = images
|
||||
|
||||
|
||||
def fetch_images(soup, base_url):
    """Collect name/url dicts for every image referenced by *soup*.

    Works only with relative ``src`` paths, which are appended to
    *base_url*. Tags that lack a ``src`` attribute are skipped: they
    would otherwise produce a bogus ``.../None`` entry.

    :param soup: an object exposing ``findAll('img')`` (BeautifulSoup).
    :param base_url: URL of the scraped page.
    :return: list of ``{'name': ..., 'url': ...}`` dicts.
    """
    images = []
    for img in soup.findAll('img'):
        src = img.get('src')
        if not src:
            # <img> without src (or with an empty one) — nothing to fetch.
            continue
        img_url = f'{base_url}/{src}'
        name = img_url.split('/')[-1]
        images.append(dict(name=name, url=img_url))
    return images
|
||||
|
||||
|
||||
def save():
    """Save the scraped images in the format chosen by the radio buttons.

    'img' asks for a directory and saves each image as a file; anything
    else asks for a target filename and dumps a base64 JSON bundle.
    Alerts and bails out when nothing has been scraped yet.
    """
    if not config.get('images'):
        alert('No images to save')
        return

    if _save_method.get() == 'img':
        save_images(filedialog.askdirectory(mustexist=True))
    else:
        target = filedialog.asksaveasfilename(
            initialfile='images.json',
            filetypes=[('JSON', '.json')])
        save_json(target)
|
||||
|
||||
|
||||
def save_images(dirname):
    """Download every scraped image and write it into *dirname*.

    Does nothing when *dirname* is falsy (dialog cancelled) or when no
    images have been scraped; alerts the user on completion.
    """
    if not (dirname and config.get('images')):
        return
    target_dir = Path(dirname)
    for img in config['images']:
        content = requests.get(img['url']).content
        with open(target_dir.joinpath(img['name']), 'wb') as outfile:
            outfile.write(content)
    alert('Done')
|
||||
|
||||
|
||||
def save_json(filename):
    """Download the scraped images and store them base64-encoded in a JSON file.

    The JSON object maps image name -> base64 string. Does nothing when
    *filename* is falsy or when no images have been scraped; alerts the
    user on completion.
    """
    if not (filename and config.get('images')):
        return
    data = {}
    for img in config['images']:
        raw = requests.get(img['url']).content
        data[img['name']] = base64.b64encode(raw).decode('utf-8')

    with open(filename, 'w') as ijson:
        ijson.write(json.dumps(data))
    alert('Done')
|
||||
|
||||
|
||||
def sb(msg):
    """Show *msg* in the status bar at the bottom of the window."""
    _status_msg.set(msg)
|
||||
|
||||
|
||||
def alert(msg):
    """Pop up a modal info dialog showing *msg*."""
    messagebox.showinfo(message=msg)
|
||||
|
||||
|
||||
if __name__ == "__main__":

    # Root window.
    _root = Tk()
    _root.title('Scrape app')

    # Main container, anchored to all four sides of the root window.
    _mainframe = ttk.Frame(_root, padding='5 5 5 5')
    _mainframe.grid(row=0, column=0, sticky=(E, W, N, S))

    # --- URL entry row -------------------------------------------------
    _url_frame = ttk.LabelFrame(
        _mainframe, text='URL', padding='5 5 5 5')
    _url_frame.grid(row=0, column=0, sticky=(E, W))
    _url_frame.columnconfigure(0, weight=1)
    _url_frame.rowconfigure(0, weight=1)

    # _url backs the entry widget; default points at the local test server.
    _url = StringVar()
    _url.set('http://localhost:8000')
    _url_entry = ttk.Entry(
        _url_frame, width=40, textvariable=_url)
    _url_entry.grid(row=0, column=0, sticky=(E, W, S, N), padx=5)

    _fetch_btn = ttk.Button(
        _url_frame, text='Fetch info', command=fetch_url)
    _fetch_btn.grid(row=0, column=1, sticky=W, padx=5)

    # --- Content area: image list + scrollbar --------------------------
    _img_frame = ttk.LabelFrame(
        _mainframe, text='Content', padding='9 0 0 0')
    _img_frame.grid(row=1, column=0, sticky=(N, S, E, W))

    # _images (a StringVar holding a tuple) backs the listbox contents.
    _images = StringVar()
    _img_listbox = Listbox(
        _img_frame, listvariable=_images, height=6, width=25)
    _img_listbox.grid(row=0, column=0, sticky=(E, W), pady=5)
    _scrollbar = ttk.Scrollbar(
        _img_frame, orient=VERTICAL, command=_img_listbox.yview)
    _scrollbar.grid(row=0, column=1, sticky=(S, N), pady=6)
    _img_listbox.configure(yscrollcommand=_scrollbar.set)

    # --- Save-format radio buttons -------------------------------------
    _radio_frame = ttk.Frame(_img_frame)
    _radio_frame.grid(row=0, column=2, sticky=(N, S, W, E))

    _choice_lbl = ttk.Label(
        _radio_frame, text="Choose how to save images")
    _choice_lbl.grid(row=0, column=0, padx=5, pady=5)

    # _save_method drives save(): 'img' for files, 'json' for a bundle.
    _save_method = StringVar()
    _save_method.set('img')
    _img_only_radio = ttk.Radiobutton(
        _radio_frame, text='As Images', variable=_save_method,
        value='img')
    _img_only_radio.grid(
        row=1, column=0, padx=5, pady=2, sticky=W)
    _img_only_radio.configure(state='normal')
    _json_radio = ttk.Radiobutton(
        _radio_frame, text='As JSON', variable=_save_method,
        value='json')
    _json_radio.grid(row=2, column=0, padx=5, pady=2, sticky=W)

    _scrape_btn = ttk.Button(
        _mainframe, text='Scrape!', command=save)
    _scrape_btn.grid(row=2, column=0, sticky=E, pady=5)

    # --- Status bar ----------------------------------------------------
    _status_frame = ttk.Frame(
        _root, relief='sunken', padding='2 2 2 2')
    _status_frame.grid(row=1, column=0, sticky=(E, W, S))

    # _status_msg is updated by sb().
    _status_msg = StringVar()
    _status_msg.set('Type a URL to start scraping...')
    _status = ttk.Label(
        _status_frame, textvariable=_status_msg, anchor=W)
    _status.grid(row=0, column=0, sticky=(E, W))

    _root.mainloop()
|
||||
|
||||
|
||||
"""
|
||||
Example on reading a JSON file:
|
||||
|
||||
with open('images.json', 'r') as f:
|
||||
data = json.loads(f.read())
|
||||
|
||||
for (name, b64val) in data.items():
|
||||
with open(name, 'wb') as f:
|
||||
f.write(base64.b64decode(b64val))
|
||||
"""
|
||||
2
ch12/requirements/requirements.in
Normal file
2
ch12/requirements/requirements.in
Normal file
@ -0,0 +1,2 @@
|
||||
beautifulsoup4
|
||||
requests
|
||||
20
ch12/requirements/requirements.txt
Normal file
20
ch12/requirements/requirements.txt
Normal file
@ -0,0 +1,20 @@
|
||||
#
|
||||
# This file is autogenerated by pip-compile with python 3.9
|
||||
# To update, run:
|
||||
#
|
||||
# pip-compile requirements.in
|
||||
#
|
||||
beautifulsoup4==4.9.3
|
||||
# via -r requirements.in
|
||||
certifi==2021.5.30
|
||||
# via requests
|
||||
charset-normalizer==2.0.3
|
||||
# via requests
|
||||
idna==3.2
|
||||
# via requests
|
||||
requests==2.26.0
|
||||
# via -r requirements.in
|
||||
soupsieve==2.2.1
|
||||
# via beautifulsoup4
|
||||
urllib3==1.26.6
|
||||
# via requests
|
||||
105
ch12/scrape.py
Normal file
105
ch12/scrape.py
Normal file
@ -0,0 +1,105 @@
|
||||
# scrape.py
|
||||
import argparse
|
||||
import base64
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
import requests
|
||||
|
||||
|
||||
def scrape(url, format_, type_):
    """Download *url*, collect matching images and save them.

    :param url: page to scrape.
    :param format_: 'img' to save image files, anything else for JSON.
    :param type_: 'all', 'png' or 'jpg' image filter.

    Network failures are printed rather than raised.
    """
    try:
        page = requests.get(url)
    except requests.RequestException as err:
        print(str(err))
        return

    soup = BeautifulSoup(page.content, 'html.parser')
    matching = filter_images(fetch_images(soup, url), type_)
    save(matching, format_)
|
||||
|
||||
|
||||
def fetch_images(soup, base_url):
    """Collect name/url dicts for every image referenced by *soup*.

    Works only with relative ``src`` paths, which are appended to
    *base_url*. Tags that lack a ``src`` attribute are skipped: they
    would otherwise produce a bogus ``.../None`` entry.

    :param soup: an object exposing ``findAll('img')`` (BeautifulSoup).
    :param base_url: URL of the scraped page.
    :return: list of ``{'name': ..., 'url': ...}`` dicts.
    """
    images = []
    for img in soup.findAll('img'):
        src = img.get('src')
        if not src:
            # <img> without src (or with an empty one) — nothing to fetch.
            continue
        img_url = f'{base_url}/{src}'
        name = img_url.split('/')[-1]
        images.append(dict(name=name, url=img_url))
    return images
|
||||
|
||||
|
||||
def filter_images(images, type_):
    """Return only the images whose extension matches *type_*.

    'all' keeps every image; 'png' keeps .png; 'jpg' keeps .jpg/.jpeg.
    """
    if type_ == 'all':
        return images
    # Map each CLI type to the file suffixes it accepts.
    allowed = {
        'png': ['.png'],
        'jpg': ['.jpg', '.jpeg'],
    }[type_]
    kept = []
    for img in images:
        if matches_extension(img['name'], allowed):
            kept.append(img)
    return kept
|
||||
|
||||
|
||||
def matches_extension(filename, extension_list):
    """Tell whether *filename* ends with one of the given extensions.

    The check is case-insensitive; extensions include the leading dot.
    """
    return Path(filename.lower()).suffix in extension_list
|
||||
|
||||
|
||||
def save(images, format_):
    """Persist *images* in the requested format and report progress.

    :param images: list of name/url dicts (may be empty).
    :param format_: 'img' to save files, anything else for JSON.
    """
    if not images:
        print('No images to save.')
        return
    saver = save_images if format_ == 'img' else save_json
    saver(images)
    print('Done')
|
||||
|
||||
|
||||
def save_images(images):
    """Download each image and write it into the current directory."""
    for img in images:
        content = requests.get(img['url']).content
        with open(img['name'], 'wb') as outfile:
            outfile.write(content)
|
||||
|
||||
|
||||
def save_json(images):
    """Download the images and dump them base64-encoded into images.json.

    The JSON object maps image name -> base64 string.
    """
    data = {}
    for img in images:
        raw = requests.get(img['url']).content
        data[img['name']] = base64.b64encode(raw).decode('utf-8')

    with open('images.json', 'w') as ijson:
        ijson.write(json.dumps(data))
|
||||
|
||||
|
||||
if __name__ == "__main__":

    # Command-line interface: the URL is positional; -t filters by image
    # type and -f selects the output format (files vs JSON bundle).
    parser = argparse.ArgumentParser(
        description='Scrape a webpage.')
    parser.add_argument(
        '-t',
        '--type',
        choices=['all', 'png', 'jpg'],
        default='all',
        help='The image type we want to scrape.')

    parser.add_argument(
        '-f',
        '--format',
        choices=['img', 'json'],
        default='img',
        help='The format images are saved to.')

    parser.add_argument(
        'url',
        help='The URL we want to scrape for images.')

    args = parser.parse_args()
    scrape(args.url, args.format, args.type)
|
||||
BIN
ch12/simple_server/img/owl-alcohol.png
Normal file
BIN
ch12/simple_server/img/owl-alcohol.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 32 KiB |
BIN
ch12/simple_server/img/owl-book.png
Normal file
BIN
ch12/simple_server/img/owl-book.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 25 KiB |
BIN
ch12/simple_server/img/owl-books.png
Normal file
BIN
ch12/simple_server/img/owl-books.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 37 KiB |
BIN
ch12/simple_server/img/owl-ebook.jpg
Normal file
BIN
ch12/simple_server/img/owl-ebook.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 58 KiB |
BIN
ch12/simple_server/img/owl-rose.jpeg
Normal file
BIN
ch12/simple_server/img/owl-rose.jpeg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 71 KiB |
15
ch12/simple_server/index.html
Normal file
15
ch12/simple_server/index.html
Normal file
@ -0,0 +1,15 @@
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head><title>Cool Owls!</title></head>
|
||||
<body>
|
||||
<h1>Welcome to our owl gallery</h1>
|
||||
<div>
|
||||
<img src="img/owl-alcohol.png" height="128" />
|
||||
<img src="img/owl-book.png" height="128" />
|
||||
<img src="img/owl-books.png" height="128" />
|
||||
<img src="img/owl-ebook.jpg" height="128" />
|
||||
<img src="img/owl-rose.jpeg" height="128" />
|
||||
</div>
|
||||
<p>Do you like these owls?</p>
|
||||
</body>
|
||||
</html>
|
||||
4
ch12/simple_server/serve.sh
Normal file
4
ch12/simple_server/serve.sh
Normal file
@ -0,0 +1,4 @@
|
||||
#!/bin/sh

# start a simple HTTP Server
# Serves the current directory (index.html plus img/) on port 8000, so
# the scrapers can be pointed at http://localhost:8000.
python -m http.server 8000
|
||||
Loading…
x
Reference in New Issue
Block a user