This commit is contained in:
adii1823 2021-10-28 17:40:56 +05:30
parent 3cebc2fe91
commit d4c7ab6c61
11 changed files with 320 additions and 0 deletions

174
ch12/guiscrape.py Normal file
View File

@ -0,0 +1,174 @@
# guiscrape.py
from tkinter import *
from tkinter import ttk, filedialog, messagebox
import base64
import json
from pathlib import Path
from bs4 import BeautifulSoup
import requests
config = {}
def fetch_url():
    """Fetch the page at the URL typed in the entry widget and list its images.

    On network failure the error is shown in the status bar and the
    previously collected images stay cleared.
    """
    url = _url.get()
    config['images'] = []
    _images.set(())  # clear the listbox (empty tuple = no entries)
    try:
        response = requests.get(url)
    except requests.RequestException as err:
        sb(str(err))
        return
    soup = BeautifulSoup(response.content, 'html.parser')
    images = fetch_images(soup, url)
    if images:
        _images.set(tuple(img['name'] for img in images))
        sb('Images found: {}'.format(len(images)))
    else:
        sb('No images found')
    config['images'] = images
def fetch_images(soup, base_url):
    """Return a list of ``{'name': ..., 'url': ...}`` dicts for every <img> in *soup*.

    Fix: the original built URLs with ``f'{base_url}/{src}'``, which only
    worked for relative src paths — an absolute src (http://...) or a
    root-relative one (/img/x.png) produced a mangled URL.  urljoin
    resolves relative srcs against *base_url* and leaves absolute srcs
    untouched, while producing identical results for the plain relative
    paths the example pages use.
    """
    from urllib.parse import urljoin  # stdlib; local so this block stands alone

    images = []
    for img in soup.findAll('img'):
        src = img.get('src')
        # Trailing '/' makes the base act as a directory for relative srcs.
        img_url = urljoin(f'{base_url}/', src)
        name = img_url.split('/')[-1]
        images.append(dict(name=name, url=img_url))
    return images
def save():
    """Save the scraped images using the method chosen in the radio buttons.

    'img' saves image files into a user-picked directory; anything else
    dumps a base64 JSON file at a user-picked path.
    """
    if not config.get('images'):
        alert('No images to save')
        return
    method = _save_method.get()
    if method == 'img':
        save_images(filedialog.askdirectory(mustexist=True))
    else:
        target = filedialog.asksaveasfilename(
            initialfile='images.json',
            filetypes=[('JSON', '.json')])
        save_json(target)
def save_images(dirname):
    """Download every scraped image into *dirname*.

    No-op when the user cancelled the dialog (empty dirname) or when
    nothing has been scraped yet.
    """
    if not (dirname and config.get('images')):
        return
    for img in config['images']:
        content = requests.get(img['url']).content
        (Path(dirname) / img['name']).write_bytes(content)
    alert('Done')
def save_json(filename):
    """Write all scraped images into *filename* as a name -> base64 JSON map.

    No-op when the user cancelled the dialog (empty filename) or when
    nothing has been scraped yet.
    """
    if not (filename and config.get('images')):
        return
    data = {
        img['name']: base64.b64encode(
            requests.get(img['url']).content).decode('utf-8')
        for img in config['images']
    }
    with open(filename, 'w') as ijson:
        json.dump(data, ijson)
    alert('Done')
def sb(msg):
    """Show *msg* in the status bar at the bottom of the window."""
    _status_msg.set(msg)
def alert(msg):
    """Pop up a modal information dialog displaying *msg*."""
    messagebox.showinfo(message=msg)
if __name__ == "__main__":
    # Build the GUI: a main frame with URL entry + fetch button on top,
    # an image listbox with save-method radio buttons in the middle,
    # a Scrape! button, and a sunken status bar at the bottom.
    _root = Tk()
    _root.title('Scrape app')
    _mainframe = ttk.Frame(_root, padding='5 5 5 5')
    _mainframe.grid(row=0, column=0, sticky=(E, W, N, S))
    # --- URL row: labeled frame holding the entry widget and fetch button.
    _url_frame = ttk.LabelFrame(
        _mainframe, text='URL', padding='5 5 5 5')
    _url_frame.grid(row=0, column=0, sticky=(E, W))
    _url_frame.columnconfigure(0, weight=1)
    _url_frame.rowconfigure(0, weight=1)
    # _url backs the entry widget; fetch_url() reads it on button press.
    _url = StringVar()
    _url.set('http://localhost:8000')  # default: the local serve.sh server
    _url_entry = ttk.Entry(
        _url_frame, width=40, textvariable=_url)
    _url_entry.grid(row=0, column=0, sticky=(E, W, S, N), padx=5)
    _fetch_btn = ttk.Button(
        _url_frame, text='Fetch info', command=fetch_url)
    _fetch_btn.grid(row=0, column=1, sticky=W, padx=5)
    # --- Content area: scrollable listbox of scraped image names.
    _img_frame = ttk.LabelFrame(
        _mainframe, text='Content', padding='9 0 0 0')
    _img_frame.grid(row=1, column=0, sticky=(N, S, E, W))
    # _images (a tuple-valued StringVar) drives the listbox contents.
    _images = StringVar()
    _img_listbox = Listbox(
        _img_frame, listvariable=_images, height=6, width=25)
    _img_listbox.grid(row=0, column=0, sticky=(E, W), pady=5)
    _scrollbar = ttk.Scrollbar(
        _img_frame, orient=VERTICAL, command=_img_listbox.yview)
    _scrollbar.grid(row=0, column=1, sticky=(S, N), pady=6)
    # Wire listbox and scrollbar to each other.
    _img_listbox.configure(yscrollcommand=_scrollbar.set)
    # --- Radio buttons choosing the save format (read by save()).
    _radio_frame = ttk.Frame(_img_frame)
    _radio_frame.grid(row=0, column=2, sticky=(N, S, W, E))
    _choice_lbl = ttk.Label(
        _radio_frame, text="Choose how to save images")
    _choice_lbl.grid(row=0, column=0, padx=5, pady=5)
    # _save_method holds 'img' (image files) or 'json' (base64 JSON dump).
    _save_method = StringVar()
    _save_method.set('img')
    _img_only_radio = ttk.Radiobutton(
        _radio_frame, text='As Images', variable=_save_method,
        value='img')
    _img_only_radio.grid(
        row=1, column=0, padx=5, pady=2, sticky=W)
    _img_only_radio.configure(state='normal')
    _json_radio = ttk.Radiobutton(
        _radio_frame, text='As JSON', variable=_save_method,
        value='json')
    _json_radio.grid(row=2, column=0, padx=5, pady=2, sticky=W)
    _scrape_btn = ttk.Button(
        _mainframe, text='Scrape!', command=save)
    _scrape_btn.grid(row=2, column=0, sticky=E, pady=5)
    # --- Status bar: sunken frame with a label driven by sb().
    _status_frame = ttk.Frame(
        _root, relief='sunken', padding='2 2 2 2')
    _status_frame.grid(row=1, column=0, sticky=(E, W, S))
    _status_msg = StringVar()
    _status_msg.set('Type a URL to start scraping...')
    _status = ttk.Label(
        _status_frame, textvariable=_status_msg, anchor=W)
    _status.grid(row=0, column=0, sticky=(E, W))
    # Blocks until the window is closed.
    _root.mainloop()
"""
Example on reading a JSON file:
with open('images.json', 'r') as f:
    data = json.loads(f.read())
    for (name, b64val) in data.items():
        with open(name, 'wb') as f:
            f.write(base64.b64decode(b64val))
"""

View File

@ -0,0 +1,2 @@
beautifulsoup4
requests

View File

@ -0,0 +1,20 @@
#
# This file is autogenerated by pip-compile with python 3.9
# To update, run:
#
# pip-compile requirements.in
#
beautifulsoup4==4.9.3
# via -r requirements.in
certifi==2021.5.30
# via requests
charset-normalizer==2.0.3
# via requests
idna==3.2
# via requests
requests==2.26.0
# via -r requirements.in
soupsieve==2.2.1
# via beautifulsoup4
urllib3==1.26.6
# via requests

105
ch12/scrape.py Normal file
View File

@ -0,0 +1,105 @@
# scrape.py
import argparse
import base64
import json
from pathlib import Path
from bs4 import BeautifulSoup
import requests
def scrape(url, format_, type_):
    """Fetch *url*, collect its images filtered by *type_*, save as *format_*.

    Network errors are printed and abort the run.
    """
    try:
        response = requests.get(url)
    except requests.RequestException as err:
        print(str(err))
        return
    soup = BeautifulSoup(response.content, 'html.parser')
    matching = filter_images(fetch_images(soup, url), type_)
    save(matching, format_)
def fetch_images(soup, base_url):
    """Return a list of ``{'name': ..., 'url': ...}`` dicts for every <img> in *soup*.

    Fix: the original noted it "works only with relative src paths" —
    ``f'{base_url}/{src}'`` mangles absolute src URLs and doubles the
    slash for root-relative ones.  urljoin resolves relative srcs
    against *base_url*, keeps absolute srcs untouched, and produces
    identical results for the plain relative paths the example page uses.
    """
    from urllib.parse import urljoin  # stdlib; local so this block stands alone

    images = []
    for img in soup.findAll('img'):
        src = img.get('src')
        # Trailing '/' makes the base act as a directory for relative srcs.
        img_url = urljoin(f'{base_url}/', src)
        name = img_url.split('/')[-1]
        images.append(dict(name=name, url=img_url))
    return images
def filter_images(images, type_):
    """Return the subset of *images* whose filename extension matches *type_*.

    *type_* is 'all' (no filtering), 'png', or 'jpg' (which also accepts
    .jpeg).  Any other value raises KeyError; the CLI's argparse choices
    prevent that in normal use.
    """
    if type_ == 'all':
        return images
    ext_map = {
        'png': ['.png'],
        'jpg': ['.jpg', '.jpeg'],
    }
    # Hoist the invariant mapping lookup out of the per-image loop
    # (the original re-evaluated ext_map[type_] for every image).
    extensions = ext_map[type_]
    return [
        img for img in images
        if matches_extension(img['name'], extensions)
    ]
def matches_extension(filename, extension_list):
    """Return True when *filename*'s suffix (case-insensitive) is in *extension_list*."""
    return Path(filename.lower()).suffix in extension_list
def save(images, format_):
    """Persist *images* as files ('img') or a JSON dump (any other format).

    Prints 'Done' on success, or a notice when there is nothing to save.
    """
    if not images:
        print('No images to save.')
        return
    saver = save_images if format_ == 'img' else save_json
    saver(images)
    print('Done')
def save_images(images):
    """Download each image into the current directory, named by its basename."""
    for img in images:
        content = requests.get(img['url']).content
        with open(img['name'], 'wb') as out:
            out.write(content)
def save_json(images):
    """Write *images* to 'images.json' as a name -> base64-encoded-content map."""
    data = {
        img['name']: base64.b64encode(
            requests.get(img['url']).content).decode('utf-8')
        for img in images
    }
    with open('images.json', 'w') as ijson:
        json.dump(data, ijson)
if __name__ == "__main__":
    # CLI entry point: scrape.py [-t all|png|jpg] [-f img|json] URL
    cli = argparse.ArgumentParser(
        description='Scrape a webpage.')
    cli.add_argument(
        '-t', '--type', choices=['all', 'png', 'jpg'], default='all',
        help='The image type we want to scrape.')
    cli.add_argument(
        '-f', '--format', choices=['img', 'json'], default='img',
        help='The format images are saved to.')
    cli.add_argument(
        'url',
        help='The URL we want to scrape for images.')
    namespace = cli.parse_args()
    scrape(namespace.url, namespace.format, namespace.type)

Binary file not shown.

After

Width:  |  Height:  |  Size: 32 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 25 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 37 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 58 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 71 KiB

View File

@ -0,0 +1,15 @@
<!DOCTYPE html>
<!-- Static test page served by serve.sh on localhost:8000.
     The scraper examples look for the five <img> tags below
     (relative srcs under img/: three .png, one .jpg, one .jpeg). -->
<html lang="en">
<head><title>Cool Owls!</title></head>
<body>
<h1>Welcome to our owl gallery</h1>
<div>
<img src="img/owl-alcohol.png" height="128" />
<img src="img/owl-book.png" height="128" />
<img src="img/owl-books.png" height="128" />
<img src="img/owl-ebook.jpg" height="128" />
<img src="img/owl-rose.jpeg" height="128" />
</div>
<p>Do you like these owls?</p>
</body>
</html>

View File

@ -0,0 +1,4 @@
#!/bin/sh
# Serve the current directory (index.html and the img/ folder) over HTTP
# on port 8000, giving the scraper examples a local page to fetch.
python -m http.server 8000