# scrape.py
# Scrape <img> tags from a web page and save them either as image files
# or as a single JSON document of base64-encoded data.
import argparse
import base64
import json
from pathlib import Path

from bs4 import BeautifulSoup
import requests


def scrape(url, format_, type_):
    # Fetch the page, then extract, filter and save its images.
    try:
        page = requests.get(url)
    except requests.RequestException as err:
        print(str(err))
    else:
        soup = BeautifulSoup(page.content, 'html.parser')
        images = fetch_images(soup, url)
        images = filter_images(images, type_)
        save(images, format_)


def fetch_images(soup, base_url):
    # Works only with relative src paths.
    images = []
    for img in soup.find_all('img'):
        src = img.get('src')
        img_url = f'{base_url}/{src}'
        name = img_url.split('/')[-1]
        images.append(dict(name=name, url=img_url))
    return images


def filter_images(images, type_):
    if type_ == 'all':
        return images
    ext_map = {
        'png': ['.png'],
        'jpg': ['.jpg', '.jpeg'],
    }
    return [
        img for img in images
        if matches_extension(img['name'], ext_map[type_])
    ]


def matches_extension(filename, extension_list):
    extension = Path(filename.lower()).suffix
    return extension in extension_list


def save(images, format_):
    if images:
        if format_ == 'img':
            save_images(images)
        else:
            save_json(images)
        print('Done')
    else:
        print('No images to save.')


def save_images(images):
    for img in images:
        img_data = requests.get(img['url']).content
        with open(img['name'], 'wb') as f:
            f.write(img_data)


def save_json(images):
    # Store each image as a base64-encoded string, keyed by file name.
    data = {}
    for img in images:
        img_data = requests.get(img['url']).content
        b64_img_data = base64.b64encode(img_data)
        str_img_data = b64_img_data.decode('utf-8')
        data[img['name']] = str_img_data

    with open('images.json', 'w') as ijson:
        ijson.write(json.dumps(data))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Scrape a webpage.')
    parser.add_argument(
        '-t',
        '--type',
        choices=['all', 'png', 'jpg'],
        default='all',
        help='The image type we want to scrape.')

    parser.add_argument(
        '-f',
        '--format',
        choices=['img', 'json'],
        default='img',
        help='The format images are saved to.')

    parser.add_argument(
        'url',
        help='The URL we want to scrape for images.')

    args = parser.parse_args()
    scrape(args.url, args.format, args.type)
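# Example invocations (a sketch; example.com is a placeholder URL, and the
# target page is assumed to use relative <img> src paths, per fetch_images):
#   python scrape.py http://example.com/gallery
#   python scrape.py -t png -f json http://example.com/gallery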