ch12
This commit is contained in:
parent
3cebc2fe91
commit
d4c7ab6c61
174
ch12/guiscrape.py
Normal file
174
ch12/guiscrape.py
Normal file
@ -0,0 +1,174 @@
|
|||||||
|
# guiscrape.py
import base64
import json
from pathlib import Path
from tkinter import *
from tkinter import ttk, filedialog, messagebox
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import requests
|
||||||
|
|
||||||
|
|
||||||
|
config = {}
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_url():
    """Callback for the 'Fetch info' button.

    Downloads the page at the URL currently in the entry widget,
    scrapes it for images, fills the listbox with the image names and
    stores the records in config['images'] for the save callbacks.
    Network errors are reported in the status bar instead of raising.
    """
    url = _url.get()
    # Reset previous results before attempting a new fetch.
    config['images'] = []
    _images.set(())  # initialised as an empty tuple
    try:
        # A timeout prevents an unresponsive server from freezing the
        # Tk event loop indefinitely.
        page = requests.get(url, timeout=10)
    except requests.RequestException as err:
        sb(str(err))
    else:
        soup = BeautifulSoup(page.content, 'html.parser')
        images = fetch_images(soup, url)
        if images:
            _images.set(tuple(img['name'] for img in images))
            sb('Images found: {}'.format(len(images)))
        else:
            sb('No images found')
        config['images'] = images
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_images(soup, base_url):
    """Extract image records from a parsed page.

    Args:
        soup: a BeautifulSoup document (anything exposing
            findAll('img') returning tags with .get('src')).
        base_url: URL the page was fetched from; relative src values
            are resolved against it.

    Returns:
        A list of dicts with 'name' (file name) and 'url'
        (resolved URL).
    """
    images = []
    for img in soup.findAll('img'):
        src = img.get('src')
        if not src:
            # An <img> without a src would otherwise yield a bogus
            # '.../None' record — skip it.
            continue
        # urljoin correctly handles absolute src values and base URLs
        # that carry a path, unlike naive f'{base_url}/{src}' joining.
        img_url = urljoin(base_url + '/', src)
        name = img_url.split('/')[-1]
        images.append(dict(name=name, url=img_url))
    return images
|
||||||
|
|
||||||
|
|
||||||
|
def save():
    """Callback for the 'Scrape!' button.

    Saves the fetched images either as individual files or as one JSON
    document, depending on the selected radio button. Shows an alert
    and bails out if nothing has been fetched yet.
    """
    if not config.get('images'):
        alert('No images to save')
        return

    if _save_method.get() == 'img':
        # askdirectory returns '' when the dialog is cancelled;
        # save_images guards against a falsy dirname.
        dirname = filedialog.askdirectory(mustexist=True)
        save_images(dirname)
    else:
        # asksaveasfilename likewise returns '' on cancel;
        # save_json guards against a falsy filename.
        filename = filedialog.asksaveasfilename(
            initialfile='images.json',
            filetypes=[('JSON', '.json')])
        save_json(filename)
|
||||||
|
|
||||||
|
|
||||||
|
def save_images(dirname):
    """Download every fetched image into directory *dirname*.

    Does nothing when *dirname* is falsy (dialog cancelled) or when no
    images have been fetched yet.
    """
    if dirname and config.get('images'):
        for img in config['images']:
            # Timeout so one stalled download cannot hang the GUI.
            img_data = requests.get(img['url'], timeout=10).content
            filename = Path(dirname).joinpath(img['name'])
            # Path.write_bytes replaces the manual open/write pair.
            filename.write_bytes(img_data)
        alert('Done')
|
||||||
|
|
||||||
|
|
||||||
|
def save_json(filename):
    """Download every fetched image and write them all into *filename*
    as a JSON object mapping image name -> base64-encoded payload.

    Does nothing when *filename* is falsy (dialog cancelled) or when
    no images have been fetched yet.
    """
    if filename and config.get('images'):
        data = {}
        for img in config['images']:
            # Timeout so one stalled download cannot hang the GUI.
            img_data = requests.get(img['url'], timeout=10).content
            b64_img_data = base64.b64encode(img_data)
            str_img_data = b64_img_data.decode('utf-8')
            data[img['name']] = str_img_data

        with open(filename, 'w') as ijson:
            ijson.write(json.dumps(data))
        alert('Done')
|
||||||
|
|
||||||
|
|
||||||
|
def sb(msg):
    """Show *msg* in the status bar at the bottom of the window."""
    _status_msg.set(msg)
|
||||||
|
|
||||||
|
|
||||||
|
def alert(msg):
    """Pop up a modal information dialog displaying *msg*."""
    messagebox.showinfo(message=msg)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":

    # Root window and main container frame.
    _root = Tk()
    _root.title('Scrape app')

    _mainframe = ttk.Frame(_root, padding='5 5 5 5')
    _mainframe.grid(row=0, column=0, sticky=(E, W, N, S))

    # --- URL input row: entry widget plus the Fetch button. ---
    _url_frame = ttk.LabelFrame(
        _mainframe, text='URL', padding='5 5 5 5')
    _url_frame.grid(row=0, column=0, sticky=(E, W))
    _url_frame.columnconfigure(0, weight=1)
    _url_frame.rowconfigure(0, weight=1)

    _url = StringVar()
    # Default points at the local test server (see simple_server/).
    _url.set('http://localhost:8000')
    _url_entry = ttk.Entry(
        _url_frame, width=40, textvariable=_url)
    _url_entry.grid(row=0, column=0, sticky=(E, W, S, N), padx=5)

    _fetch_btn = ttk.Button(
        _url_frame, text='Fetch info', command=fetch_url)
    _fetch_btn.grid(row=0, column=1, sticky=W, padx=5)

    # --- Results area: listbox of scraped image names + scrollbar. ---
    _img_frame = ttk.LabelFrame(
        _mainframe, text='Content', padding='9 0 0 0')
    _img_frame.grid(row=1, column=0, sticky=(N, S, E, W))

    # _images backs the listbox contents; fetch_url() sets it.
    _images = StringVar()
    _img_listbox = Listbox(
        _img_frame, listvariable=_images, height=6, width=25)
    _img_listbox.grid(row=0, column=0, sticky=(E, W), pady=5)
    _scrollbar = ttk.Scrollbar(
        _img_frame, orient=VERTICAL, command=_img_listbox.yview)
    _scrollbar.grid(row=0, column=1, sticky=(S, N), pady=6)
    # Wire the listbox and scrollbar to each other.
    _img_listbox.configure(yscrollcommand=_scrollbar.set)

    # --- Save-method radio buttons (files vs JSON). ---
    _radio_frame = ttk.Frame(_img_frame)
    _radio_frame.grid(row=0, column=2, sticky=(N, S, W, E))

    _choice_lbl = ttk.Label(
        _radio_frame, text="Choose how to save images")
    _choice_lbl.grid(row=0, column=0, padx=5, pady=5)

    # _save_method is read by save(); 'img' or 'json'.
    _save_method = StringVar()
    _save_method.set('img')
    _img_only_radio = ttk.Radiobutton(
        _radio_frame, text='As Images', variable=_save_method,
        value='img')
    _img_only_radio.grid(
        row=1, column=0, padx=5, pady=2, sticky=W)
    _img_only_radio.configure(state='normal')
    _json_radio = ttk.Radiobutton(
        _radio_frame, text='As JSON', variable=_save_method,
        value='json')
    _json_radio.grid(row=2, column=0, padx=5, pady=2, sticky=W)

    _scrape_btn = ttk.Button(
        _mainframe, text='Scrape!', command=save)
    _scrape_btn.grid(row=2, column=0, sticky=E, pady=5)

    # --- Status bar: sunken frame with a message label; sb() updates it. ---
    _status_frame = ttk.Frame(
        _root, relief='sunken', padding='2 2 2 2')
    _status_frame.grid(row=1, column=0, sticky=(E, W, S))

    _status_msg = StringVar()
    _status_msg.set('Type a URL to start scraping...')
    _status = ttk.Label(
        _status_frame, textvariable=_status_msg, anchor=W)
    _status.grid(row=0, column=0, sticky=(E, W))

    # Hand control to the Tk event loop; blocks until the window closes.
    _root.mainloop()
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
|
Example on reading a JSON file:
|
||||||
|
|
||||||
|
with open('images.json', 'r') as f:
|
||||||
|
data = json.loads(f.read())
|
||||||
|
|
||||||
|
for (name, b64val) in data.items():
|
||||||
|
with open(name, 'wb') as f:
|
||||||
|
f.write(base64.b64decode(b64val))
|
||||||
|
"""
|
||||||
2
ch12/requirements/requirements.in
Normal file
2
ch12/requirements/requirements.in
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
beautifulsoup4
|
||||||
|
requests
|
||||||
20
ch12/requirements/requirements.txt
Normal file
20
ch12/requirements/requirements.txt
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
#
|
||||||
|
# This file is autogenerated by pip-compile with python 3.9
|
||||||
|
# To update, run:
|
||||||
|
#
|
||||||
|
# pip-compile requirements.in
|
||||||
|
#
|
||||||
|
beautifulsoup4==4.9.3
|
||||||
|
# via -r requirements.in
|
||||||
|
certifi==2021.5.30
|
||||||
|
# via requests
|
||||||
|
charset-normalizer==2.0.3
|
||||||
|
# via requests
|
||||||
|
idna==3.2
|
||||||
|
# via requests
|
||||||
|
requests==2.26.0
|
||||||
|
# via -r requirements.in
|
||||||
|
soupsieve==2.2.1
|
||||||
|
# via beautifulsoup4
|
||||||
|
urllib3==1.26.6
|
||||||
|
# via requests
|
||||||
105
ch12/scrape.py
Normal file
105
ch12/scrape.py
Normal file
@ -0,0 +1,105 @@
|
|||||||
|
# scrape.py
|
||||||
|
import argparse
|
||||||
|
import base64
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import requests
|
||||||
|
|
||||||
|
|
||||||
|
def scrape(url, format_, type_):
    """Fetch *url*, collect its images filtered by *type_*, and save
    them in *format_* ('img' for individual files, 'json' for one
    base64 document).

    Network errors are printed instead of raising.
    """
    try:
        # A timeout keeps the script from hanging forever on a dead
        # or unresponsive server.
        page = requests.get(url, timeout=10)
    except requests.RequestException as err:
        print(str(err))
    else:
        soup = BeautifulSoup(page.content, 'html.parser')
        images = fetch_images(soup, url)
        images = filter_images(images, type_)
        save(images, format_)
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_images(soup, base_url):
    """Collect a name/url record for every <img> tag in *soup*.

    Note: URLs are built by simple concatenation, so this works only
    with relative src paths.
    """
    records = []
    for tag in soup.findAll('img'):
        full_url = '{}/{}'.format(base_url, tag.get('src'))
        records.append({'name': full_url.split('/')[-1], 'url': full_url})
    return records
|
||||||
|
|
||||||
|
|
||||||
|
def filter_images(images, type_):
    """Return the subset of *images* whose file extension matches
    *type_*: 'all' keeps everything, 'png'/'jpg' keep those formats.
    """
    if type_ == 'all':
        return images
    # Map each CLI choice to its accepted file suffixes.
    allowed = {
        'png': ['.png'],
        'jpg': ['.jpg', '.jpeg'],
    }[type_]
    return [image for image in images
            if matches_extension(image['name'], allowed)]


def matches_extension(filename, extension_list):
    """True if *filename*'s lower-cased suffix is in *extension_list*."""
    return Path(filename.lower()).suffix in extension_list
|
||||||
|
|
||||||
|
|
||||||
|
def save(images, format_):
    """Dispatch saving of *images* to the right writer for *format_*
    ('img' -> files, anything else -> JSON) and report the outcome.
    """
    # Guard clause: nothing fetched means nothing to do.
    if not images:
        print('No images to save.')
        return
    saver = save_images if format_ == 'img' else save_json
    saver(images)
    print('Done')
|
||||||
|
|
||||||
|
|
||||||
|
def save_images(images):
    """Download each image record and write it to the current working
    directory under its own name.
    """
    for img in images:
        # Timeout so a single stalled download cannot hang the run.
        img_data = requests.get(img['url'], timeout=10).content
        with open(img['name'], 'wb') as f:
            f.write(img_data)
|
||||||
|
|
||||||
|
|
||||||
|
def save_json(images):
    """Download each image record and write them all into 'images.json'
    in the current directory, as a JSON object mapping image name ->
    base64-encoded payload.
    """
    data = {}
    for img in images:
        # Timeout so a single stalled download cannot hang the run.
        img_data = requests.get(img['url'], timeout=10).content
        b64_img_data = base64.b64encode(img_data)
        str_img_data = b64_img_data.decode('utf-8')
        data[img['name']] = str_img_data

    with open('images.json', 'w') as ijson:
        ijson.write(json.dumps(data))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":

    # Command-line interface: the URL is positional; image type and
    # output format are optional flags with safe defaults.
    parser = argparse.ArgumentParser(
        description='Scrape a webpage.')
    parser.add_argument(
        '-t',
        '--type',
        choices=['all', 'png', 'jpg'],
        default='all',
        help='The image type we want to scrape.')

    parser.add_argument(
        '-f',
        '--format',
        choices=['img', 'json'],
        default='img',
        help='The format images are saved to.')

    parser.add_argument(
        'url',
        help='The URL we want to scrape for images.')

    args = parser.parse_args()
    scrape(args.url, args.format, args.type)
|
||||||
BIN
ch12/simple_server/img/owl-alcohol.png
Normal file
BIN
ch12/simple_server/img/owl-alcohol.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 32 KiB |
BIN
ch12/simple_server/img/owl-book.png
Normal file
BIN
ch12/simple_server/img/owl-book.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 25 KiB |
BIN
ch12/simple_server/img/owl-books.png
Normal file
BIN
ch12/simple_server/img/owl-books.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 37 KiB |
BIN
ch12/simple_server/img/owl-ebook.jpg
Normal file
BIN
ch12/simple_server/img/owl-ebook.jpg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 58 KiB |
BIN
ch12/simple_server/img/owl-rose.jpeg
Normal file
BIN
ch12/simple_server/img/owl-rose.jpeg
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 71 KiB |
15
ch12/simple_server/index.html
Normal file
15
ch12/simple_server/index.html
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
<!DOCTYPE html>
|
||||||
|
<html lang="en">
|
||||||
|
<head><title>Cool Owls!</title></head>
|
||||||
|
<body>
|
||||||
|
<h1>Welcome to our owl gallery</h1>
|
||||||
|
<div>
|
||||||
|
<img src="img/owl-alcohol.png" height="128" />
|
||||||
|
<img src="img/owl-book.png" height="128" />
|
||||||
|
<img src="img/owl-books.png" height="128" />
|
||||||
|
<img src="img/owl-ebook.jpg" height="128" />
|
||||||
|
<img src="img/owl-rose.jpeg" height="128" />
|
||||||
|
</div>
|
||||||
|
<p>Do you like these owls?</p>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
4
ch12/simple_server/serve.sh
Normal file
4
ch12/simple_server/serve.sh
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
#!/bin/sh

# Start a simple HTTP server on port 8000, serving the current
# directory (index.html plus the img/ folder) as the scraping target
# for guiscrape.py / scrape.py.
python -m http.server 8000
||||||
Loading…
x
Reference in New Issue
Block a user