2021-10-28 17:40:56 +05:30

106 lines
2.5 KiB
Python

# scrape.py
import argparse
import base64
import json
from pathlib import Path
from bs4 import BeautifulSoup
import requests
def scrape(url, format_, type_):
try:
page = requests.get(url)
except requests.RequestException as err:
print(str(err))
else:
soup = BeautifulSoup(page.content, 'html.parser')
images = fetch_images(soup, url)
images = filter_images(images, type_)
save(images, format_)
def fetch_images(soup, base_url):
# Works only with relative src paths.
images = []
for img in soup.findAll('img'):
src = img.get('src')
img_url = f'{base_url}/{src}'
name = img_url.split('/')[-1]
images.append(dict(name=name, url=img_url))
return images
def filter_images(images, type_):
if type_ == 'all':
return images
ext_map = {
'png': ['.png'],
'jpg': ['.jpg', '.jpeg'],
}
return [
img for img in images
if matches_extension(img['name'], ext_map[type_])
]
def matches_extension(filename, extension_list):
extension = Path(filename.lower()).suffix
return extension in extension_list
def save(images, format_):
if images:
if format_ == 'img':
save_images(images)
else:
save_json(images)
print('Done')
else:
print('No images to save.')
def save_images(images):
for img in images:
img_data = requests.get(img['url']).content
with open(img['name'], 'wb') as f:
f.write(img_data)
def save_json(images):
data = {}
for img in images:
img_data = requests.get(img['url']).content
b64_img_data = base64.b64encode(img_data)
str_img_data = b64_img_data.decode('utf-8')
data[img['name']] = str_img_data
with open('images.json', 'w') as ijson:
ijson.write(json.dumps(data))
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description='Scrape a webpage.')
parser.add_argument(
'-t',
'--type',
choices=['all', 'png', 'jpg'],
default='all',
help='The image type we want to scrape.')
parser.add_argument(
'-f',
'--format',
choices=['img', 'json'],
default='img',
help='The format images are saved to.')
parser.add_argument(
'url',
help='The URL we want to scrape for images.')
args = parser.parse_args()
scrape(args.url, args.format, args.type)