From ce023f992815eb5a2f4f62fbd7d9d472e41ccaf4 Mon Sep 17 00:00:00 2001 From: kolt54321 <34801180+kolt54321@users.noreply.github.com> Date: Mon, 13 Jul 2020 22:50:49 -0400 Subject: [PATCH 01/15] Allows list downloading and tracking External file that has arguments for importing a list of URL's as well as downloading a list in the queue/cache. After downloading an image it sets the download flag to true - so if the script breaks in middle it will just pick up from where it left off. Some measures here are redundant and can be consolidated further. A measure is in place to keep the zoom level within JPEG limits. Small sleep included so we (hopefully) don't hit 429 errors. --- autodownload.py | 112 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 autodownload.py diff --git a/autodownload.py b/autodownload.py new file mode 100644 index 0000000..06635f2 --- /dev/null +++ b/autodownload.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python3 +# coding: utf-8 +import asyncio +import tile_fetch +import sys +import pandas as pd +from random import randint +from time import sleep + +def main(): + import argparse + + parser = argparse.ArgumentParser(description='Google Arts & Culture website downloader') + parser.add_argument('-a','--add_url', type=str, nargs='?', help='Add new Arts & Culture URLs.', + action='store', dest='url') + parser.add_argument('-z','--zoom', type=int, nargs=1, + help='Zoom level to fetch, can be negative. Will print zoom levels if omitted') + parser.add_argument('-q','--quality', type=int, nargs='?', default=90, + help='Compression level from 0-95. 
Higher is better.') + parser.add_argument('-d', '--download', help="Downloads all remaining links in queue.",action="store_true", default=False) + parser.add_argument('-b', '--batch-add', type=str, nargs=1, help="Adds a list of URL's to the queue from a csv file of URLs.", action="store", dest='csv') + args = parser.parse_args() + + df = None + try: + df = pd.read_csv("dlcache", index_col=0) + except: + print("No cache found. Setting up a new one.") + df = pd.DataFrame(columns=['url', 'quality', 'downloaded']) + + if args.csv: + url_df = pd.read_csv(args.csv[0]) + for u in url_df['url']: + print("######### Processing '{}'".format(u)) + + img_id = u[-(len(u)-u.rfind("/")-1):] + print(img_id) + + if not (img_id in df.index): + assert 0 <= args.quality <= 95, "Image quality must be between 0 and 95" + df.loc[img_id] = {'url':u, 'quality':args.quality, "downloaded":False} + print("######### Added to queue.") + else: + print("Image already in list. Ignoring the URL.") + + if args.url: + print("######### Processing '{}'".format(args.url)) + u = args.url + img_id = u[-(len(u)-u.rfind("/")-1):] + if not (img_id in df.index): + df.loc[image_info.image_id] = {'url':args.url, 'quality':args.quality, "downloaded":False} + print("######### Added to queue.") + else: + print("Image already in list. 
Ignoring the URL.") + + if args.download: + print("######### Starting download") + for row in df.loc[df['downloaded'] == False].iterrows(): + print(row[1]['url']) + image_info = None + + try: + image_info = tile_fetch.ImageInfo(row[1]['url']) + except: + print("Invalid url.") + valid_url = False + + assert 0 <= args.quality <= 95, "Image quality must be between 0 and 95" + + if image_info: + if args.zoom: + zoom = args.zoom + try: + assert 0 <= zoom < len(image_info.tile_info) + except: + print('No valid zoom level.') + else: + zoom = len(image_info.tile_info)-1 + print("Defaulting to highest zoom level ({}).".format(zoom)) + + ## Ensuring image resolution fits in JPEG - two pass + if image_info.tile_info[zoom].size[0] > 65535 or image_info.tile_info[zoom].size[1] > 65535: + print( + 'Zoom level {r} too high for JPEG output, using next zoom level {next_zoom} instead'.format( + r=zoom, + next_zoom=zoom-1) + ) + zoom = zoom-1 + + if image_info.tile_info[zoom].size[0] > 65535 or image_info.tile_info[zoom].size[1] > 65535: + print( + 'Zoom level {r} *still* too high for JPEG output, using next zoom level {next_zoom} instead'.format( + r=zoom, + next_zoom=zoom-1) + ) + zoom = zoom-1 + + print("Using zoom level {}.".format(zoom)) + + + coro = tile_fetch.load_tiles(image_info, zoom, image_info.image_name, row[1]['quality']) + loop = asyncio.get_event_loop() + loop.run_until_complete(coro) + df.at[image_info.image_id, "downloaded"] = True + print("Download successful. Sleeping before next download...") + sleep(randint(30,40)) + print("######### Finished download") + + df.to_csv('dlcache') + +if __name__ == '__main__': + main() \ No newline at end of file From 88a12fd8192fb0bd8b17f25307bed26b75724cdd Mon Sep 17 00:00:00 2001 From: kolt54321 <34801180+kolt54321@users.noreply.github.com> Date: Mon, 13 Jul 2020 23:03:21 -0400 Subject: [PATCH 02/15] Add embed metadata, and revise filename Revising the filename for sorting on artist, and chronological order. 
While author name is usually in the URL along with the art name, it's impossible to separate the two by code. Hence reliance on metadata tags to grab author (and date of painting). Would need a backup plan when it can't find author in the data - though it should still be named fine. Cases where author name is not in the URL, however, may cause incorrectly truncating the image name. The embed metadata can be viewed by dropping the final image into exiftool(-k) and so forth. --- tile_fetch.py | 41 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/tile_fetch.py b/tile_fetch.py index 1ddcfec..27f3a52 100755 --- a/tile_fetch.py +++ b/tile_fetch.py @@ -8,6 +8,7 @@ import re import shutil import string +import unidecode import urllib.parse import urllib.request from pathlib import Path @@ -15,6 +16,7 @@ import aiohttp from PIL import Image from lxml import etree +from pyexiv2 import Image as TaggedImage import async_tile_fetcher from decryption import decrypt @@ -42,6 +44,14 @@ class ImageInfo(object): def __init__(self, url): page_source = urllib.request.urlopen(url).read() + self.metadata = {'Xmp.xmp.URL': url} + for item in html.fromstring(page_source).cssselect('[id^="metadata"] li'): + text = item.text_content() + # XMP metadata needs to be under the Xmp.xml section + # removes and non-word character from the title as they invalid for metadata tag names + key = 'Xmp.xmp.' 
+ re.sub(r'\W', '', text[:text.find(':')]) + self.metadata[key] = text[text.find(':') + 1:].strip() + match = self.RE_URL_PATH_TOKEN.search(page_source) if match is None: raise ValueError("Unable to find google arts image token") @@ -51,7 +61,8 @@ def __init__(self, url): self.token = token or b'' url_path = urllib.parse.unquote_plus(urllib.parse.urlparse(url).path) self.image_slug, image_id = url_path.split('/')[-2:] - self.image_name = '%s - %s' % (string.capwords(self.image_slug.replace("-"," ")), image_id) + self.image_name = unidecode.unidecode(string.capwords(self.image_slug.replace("-"," "))) + self.image_id = image_id meta_info_url = "https:{}=g".format(url_no_proto.decode('utf8')) meta_info_tree = etree.fromstring(urllib.request.urlopen(meta_info_url).read()) @@ -137,9 +148,31 @@ async def load_tiles(info, z=-1, outfile=None, quality=90): tile_img = Image.open(io.BytesIO(clear_bytes)) img.paste(tile_img, (x * info.tile_width, y * info.tile_height)) - print("Downloaded all tiles. 
Saving...") - final_image_filename = outfile or (info.image_name + '.jpg') - img.save(final_image_filename, quality=quality, subsampling=0) + ## Try to extract author name ("Creator"/"Painter") and date ("Date Created"/"Date") from metadata + author = "0" + date = "" + for key, value in info.metadata.items(): + if key.lower() == "xmp.xmp.creator" or key.lower() == "xmp.xmp.painter": + # Avoiding non-ASCII characters in the painter/creator name + author = unidecode.unidecode(value) + elif key.lower() == "xmp.xmp.date" or key.lower() == "xmp.xmp.datecreated": + # Avoiding "/" in the date (year), especially when multiple dates are given + date = value.replace('/','-') + + # Taking out the author's name from the image name - authors name is appended later + modified_image_name = info.image_name[0:len(info.image_name)-len(author)-1] + + final_image_filename = (author + ' - ' + date + ' - ' + modified_image_name + ' - ' +info.image_id + '.jpg') + img.save(final_image_filename, quality=quality, subsampling=0, optimize=True) + xmp_file_obj = TaggedImage(final_image_filename) + + # writes key:value one at a time, which is heavier on writes, + # but far more robust. 
+ for key, value in info.metadata.items(): + try: + xmp_file_obj.modify_xmp({key: value}) + except RuntimeError: + print(f'Failed to add add XMP tag with key "{key}" with value "{value}"') shutil.rmtree(tiles_dir) print("Saved the result as " + final_image_filename) From 5cff461f6309cd0e606ff2242c7c9785be5d7a8f Mon Sep 17 00:00:00 2001 From: kolt54321 <34801180+kolt54321@users.noreply.github.com> Date: Mon, 13 Jul 2020 23:05:21 -0400 Subject: [PATCH 03/15] Defaulting to zoom levels that fit in JPEG --- tile_fetch.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tile_fetch.py b/tile_fetch.py index 27f3a52..3e111e0 100755 --- a/tile_fetch.py +++ b/tile_fetch.py @@ -125,6 +125,23 @@ async def load_tiles(info, z=-1, outfile=None, quality=90): ) z = len(info.tile_info) - 1 + ## Ensuring image resolution fits in JPEG - two pass + if info.tile_info[z].size[0] > 65535 or info.tile_info[z].size[1] > 65535: + print( + 'Zoom level {r} too high for JPEG output, using next zoom level {next_z} instead'.format( + r=z, + next_z=z-1) + ) + z = z-1 + + if info.tile_info[z].size[0] > 65535 or info.tile_info[z].size[1] > 65535: + print( + 'Zoom level {r} *still* too high for JPEG output, using next zoom level {next_z} instead'.format( + r=z, + next_z=z-1) + ) + z = z-1 + z %= len(info.tile_info) # keep 0 <= z < len(tile_info) level = info.tile_info[z] From d08f19a154a41a546a618cdba1fbad801e4a56f5 Mon Sep 17 00:00:00 2001 From: kolt54321 <34801180+kolt54321@users.noreply.github.com> Date: Mon, 13 Jul 2020 23:06:40 -0400 Subject: [PATCH 04/15] Small forgotten changes --- tile_fetch.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tile_fetch.py b/tile_fetch.py index 3e111e0..055f292 100755 --- a/tile_fetch.py +++ b/tile_fetch.py @@ -15,7 +15,7 @@ import aiohttp from PIL import Image -from lxml import etree +from lxml import etree, html from pyexiv2 import Image as TaggedImage import async_tile_fetcher @@ -165,6 +165,8 @@ async def 
load_tiles(info, z=-1, outfile=None, quality=90): tile_img = Image.open(io.BytesIO(clear_bytes)) img.paste(tile_img, (x * info.tile_width, y * info.tile_height)) + print("Downloaded all tiles. Saving...") + ## Try to extract author name ("Creator"/"Painter") and date ("Date Created"/"Date") from metadata author = "0" date = "" @@ -192,8 +194,7 @@ async def load_tiles(info, z=-1, outfile=None, quality=90): print(f'Failed to add add XMP tag with key "{key}" with value "{value}"') shutil.rmtree(tiles_dir) print("Saved the result as " + final_image_filename) - - + def main(): import argparse From 521c0151b01d3aa74ea007984b850a79c8697fc3 Mon Sep 17 00:00:00 2001 From: kolt54321 <34801180+kolt54321@users.noreply.github.com> Date: Mon, 13 Jul 2020 23:08:53 -0400 Subject: [PATCH 05/15] Requirement updates --- requirements.txt | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 145d4d3..46aeaa5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,9 @@ pycryptodome -lxml -Pillow -aiohttp \ No newline at end of file +lxml~=4.5.1 +Pillow~=7.1.2 +aiohttp~=3.6.2 +pyexiv2~=2.2.0 +cssselect +unidecode +pandas +html \ No newline at end of file From 0c7144a82affd7a793c50dd452cad97088855e2e Mon Sep 17 00:00:00 2001 From: kolt54321 <34801180+kolt54321@users.noreply.github.com> Date: Wed, 15 Jul 2020 13:28:42 -0400 Subject: [PATCH 06/15] Integrating autodownload into main file, PNG output, other changes --- autodownload.py | 112 -------------------------- requirements.txt | 3 - tile_fetch.py | 200 ++++++++++++++++++++++++++++++++++++----------- 3 files changed, 154 insertions(+), 161 deletions(-) delete mode 100644 autodownload.py diff --git a/autodownload.py b/autodownload.py deleted file mode 100644 index 06635f2..0000000 --- a/autodownload.py +++ /dev/null @@ -1,112 +0,0 @@ -#!/usr/bin/env python3 -# coding: utf-8 -import asyncio -import tile_fetch -import sys -import pandas as pd -from random import 
randint -from time import sleep - -def main(): - import argparse - - parser = argparse.ArgumentParser(description='Google Arts & Culture website downloader') - parser.add_argument('-a','--add_url', type=str, nargs='?', help='Add new Arts & Culture URLs.', - action='store', dest='url') - parser.add_argument('-z','--zoom', type=int, nargs=1, - help='Zoom level to fetch, can be negative. Will print zoom levels if omitted') - parser.add_argument('-q','--quality', type=int, nargs='?', default=90, - help='Compression level from 0-95. Higher is better.') - parser.add_argument('-d', '--download', help="Downloads all remaining links in queue.",action="store_true", default=False) - parser.add_argument('-b', '--batch-add', type=str, nargs=1, help="Adds a list of URL's to the queue from a csv file of URLs.", action="store", dest='csv') - args = parser.parse_args() - - df = None - try: - df = pd.read_csv("dlcache", index_col=0) - except: - print("No cache found. Setting up a new one.") - df = pd.DataFrame(columns=['url', 'quality', 'downloaded']) - - if args.csv: - url_df = pd.read_csv(args.csv[0]) - for u in url_df['url']: - print("######### Processing '{}'".format(u)) - - img_id = u[-(len(u)-u.rfind("/")-1):] - print(img_id) - - if not (img_id in df.index): - assert 0 <= args.quality <= 95, "Image quality must be between 0 and 95" - df.loc[img_id] = {'url':u, 'quality':args.quality, "downloaded":False} - print("######### Added to queue.") - else: - print("Image already in list. Ignoring the URL.") - - if args.url: - print("######### Processing '{}'".format(args.url)) - u = args.url - img_id = u[-(len(u)-u.rfind("/")-1):] - if not (img_id in df.index): - df.loc[image_info.image_id] = {'url':args.url, 'quality':args.quality, "downloaded":False} - print("######### Added to queue.") - else: - print("Image already in list. 
Ignoring the URL.") - - if args.download: - print("######### Starting download") - for row in df.loc[df['downloaded'] == False].iterrows(): - print(row[1]['url']) - image_info = None - - try: - image_info = tile_fetch.ImageInfo(row[1]['url']) - except: - print("Invalid url.") - valid_url = False - - assert 0 <= args.quality <= 95, "Image quality must be between 0 and 95" - - if image_info: - if args.zoom: - zoom = args.zoom - try: - assert 0 <= zoom < len(image_info.tile_info) - except: - print('No valid zoom level.') - else: - zoom = len(image_info.tile_info)-1 - print("Defaulting to highest zoom level ({}).".format(zoom)) - - ## Ensuring image resolution fits in JPEG - two pass - if image_info.tile_info[zoom].size[0] > 65535 or image_info.tile_info[zoom].size[1] > 65535: - print( - 'Zoom level {r} too high for JPEG output, using next zoom level {next_zoom} instead'.format( - r=zoom, - next_zoom=zoom-1) - ) - zoom = zoom-1 - - if image_info.tile_info[zoom].size[0] > 65535 or image_info.tile_info[zoom].size[1] > 65535: - print( - 'Zoom level {r} *still* too high for JPEG output, using next zoom level {next_zoom} instead'.format( - r=zoom, - next_zoom=zoom-1) - ) - zoom = zoom-1 - - print("Using zoom level {}.".format(zoom)) - - - coro = tile_fetch.load_tiles(image_info, zoom, image_info.image_name, row[1]['quality']) - loop = asyncio.get_event_loop() - loop.run_until_complete(coro) - df.at[image_info.image_id, "downloaded"] = True - print("Download successful. 
Sleeping before next download...") - sleep(randint(30,40)) - print("######### Finished download") - - df.to_csv('dlcache') - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 46aeaa5..c0ac8b9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,4 @@ lxml~=4.5.1 Pillow~=7.1.2 aiohttp~=3.6.2 pyexiv2~=2.2.0 -cssselect -unidecode pandas -html \ No newline at end of file diff --git a/tile_fetch.py b/tile_fetch.py index 055f292..6cf8189 100755 --- a/tile_fetch.py +++ b/tile_fetch.py @@ -11,6 +11,10 @@ import unidecode import urllib.parse import urllib.request +import pandas as pd +from random import randint +from time import sleep + from pathlib import Path import aiohttp @@ -125,26 +129,14 @@ async def load_tiles(info, z=-1, outfile=None, quality=90): ) z = len(info.tile_info) - 1 - ## Ensuring image resolution fits in JPEG - two pass - if info.tile_info[z].size[0] > 65535 or info.tile_info[z].size[1] > 65535: - print( - 'Zoom level {r} too high for JPEG output, using next zoom level {next_z} instead'.format( - r=z, - next_z=z-1) - ) - z = z-1 - - if info.tile_info[z].size[0] > 65535 or info.tile_info[z].size[1] > 65535: - print( - 'Zoom level {r} *still* too high for JPEG output, using next zoom level {next_z} instead'.format( - r=z, - next_z=z-1) - ) - z = z-1 - z %= len(info.tile_info) # keep 0 <= z < len(tile_info) level = info.tile_info[z] + PNG_Output = 0 + if info.tile_info[z].size[0] > 65535 or info.tile_info[z].size[1] > 65535: + PNG_Output = 1 + print(level.size) + print(z) img = Image.new(mode="RGB", size=level.size) tiles_dir = Path(info.image_name) @@ -167,7 +159,7 @@ async def load_tiles(info, z=-1, outfile=None, quality=90): print("Downloaded all tiles. 
Saving...") - ## Try to extract author name ("Creator"/"Painter") and date ("Date Created"/"Date") from metadata + ## Try to extract author name ("Creator"/"Painter") and date ("Date Created"/"Date") from metadata author = "0" date = "" for key, value in info.metadata.items(): @@ -180,21 +172,40 @@ async def load_tiles(info, z=-1, outfile=None, quality=90): # Taking out the author's name from the image name - authors name is appended later modified_image_name = info.image_name[0:len(info.image_name)-len(author)-1] - - final_image_filename = (author + ' - ' + date + ' - ' + modified_image_name + ' - ' +info.image_id + '.jpg') - img.save(final_image_filename, quality=quality, subsampling=0, optimize=True) - xmp_file_obj = TaggedImage(final_image_filename) - # writes key:value one at a time, which is heavier on writes, - # but far more robust. - for key, value in info.metadata.items(): + if PNG_Output == 1: + if author == 0: + final_image_filename = (info.image_name + '.png') + else: + final_image_filename = (author + ' - ' + date + ' - ' + modified_image_name + ' - ' +info.image_id + '.png') + ## Optimize=True for PNG attempts the highest level of lossless compression possible. + img.save(final_image_filename, optimize=True) + else: + if author == 0: + final_image_filename = (info.image_name + '.jpg') + else: + final_image_filename = (author + ' - ' + date + ' - ' + modified_image_name + ' - ' +info.image_id + '.jpg') + ## Optimize = True for JPEG breaks ("Suspension not allowed here" error) if quality is 95 and the file is large enough - from what I can test anyway. 
+ img.save(final_image_filename, quality=quality, subsampling=0, optimize=True if quality < 95) + + xmp_file_obj = TaggedImage(final_image_filename) + if PNG_Output == 0: try: - xmp_file_obj.modify_xmp({key: value}) - except RuntimeError: - print(f'Failed to add add XMP tag with key "{key}" with value "{value}"') + xmp_file_obj.modify_xmp(info.metadata) + except: + print("Cannot write all metadata at once; writing tag by tag...") + # writes key:value one at a time, which is heavier on writes, + # but far more robust. + for key, value in info.metadata.items(): + try: + xmp_file_obj.modify_xmp({key: value}) + except RuntimeError: + print(f'Failed to add add XMP tag with key "{key}" with value "{value}"') + print(repr(e)) shutil.rmtree(tiles_dir) print("Saved the result as " + final_image_filename) + def main(): import argparse @@ -205,29 +216,126 @@ def main(): parser.add_argument('--outfile', type=str, nargs='?', help='The name of the file to create.') parser.add_argument('--quality', type=int, nargs='?', default=90, - help='Compression level from 0-95. Higher is better.') + help='Compression level from 0-95. 
Higher is better quality, larger file size.') + parser.add_argument('-a','--add_url', type=str, nargs='?', help='Add a new URL to the queue.', + action='store', dest='add_url') + parser.add_argument('-b', '--batch-add', type=str, nargs=1, help="Adds a list of URL's to the queue from a csv file.", action="store", dest='csv') + parser.add_argument('-d', '--download', help="Downloads all remaining links in the queue.",action="store_true", default=None) args = parser.parse_args() assert 0 <= args.quality <= 95, "Image quality must be between 0 and 95" - url = args.url or input("Enter the url of the image: ") - - print("Downloading image meta-information...") - image_info = ImageInfo(url) - zoom = args.zoom - if zoom is None: - print(image_info) - while True: + if args.csv or args.add_url or args.download: + df = None + try: + df = pd.read_csv("dlcache", index_col=0) + except: + print("No cache found. Setting up a new one.") + df = pd.DataFrame(columns=['url', 'quality', 'downloaded']) + + if args.csv: + url_df = pd.read_csv(args.csv[0]) + for u in url_df['url']: + print("######### Processing '{}'".format(u)) + img_id = u[-(len(u)-u.rfind("/")-1):] + + if not (img_id in df.index): + assert 0 <= args.quality <= 95, "Image quality must be between 0 and 95" + df.loc[img_id] = {'url':u, 'quality':args.quality, "downloaded":False} + print("######### Added to queue.") + else: + print("Image already in list. Ignoring the URL.") + + if args.add_url: + print("######### Processing '{}'".format(args.add_url)) + u = args.add_url + img_id = u[-(len(u)-u.rfind("/")-1):] + if not (img_id in df.index): + df.loc[img_id] = {'url':args.add_url, 'quality':args.quality, "downloaded":False} + print("######### Added to queue.") + else: + print("Image already in list. 
Ignoring the URL.") + + if args.download: + print("######### Starting download") + for row in df.loc[df['downloaded'] == False].iterrows(): + print(row[1]['url']) + img_info = None + try: - zoom = int(input("Which level do you want to download? ")) - assert 0 <= zoom < len(image_info.tile_info) - break - except (ValueError, AssertionError): - print("Not a valid zoom level.") - - coro = load_tiles(image_info, zoom, args.outfile, args.quality) - loop = asyncio.get_event_loop() - loop.run_until_complete(coro) + img_info = ImageInfo(row[1]['url']) + except: + print("Invalid url.") + valid_url = False + + #if args.quality is None: - maybe add handling for overwriting quality in batch file? + assert 0 <= ImageInfo(row[1]['quality']) <= 95, "Image quality must be between 0 and 95" + + if img_info: + if args.zoom: + zoom = args.zoom + try: + assert 0 <= zoom < len(img_info.tile_info) + except: + print('No valid zoom level.') + else: + zoom = len(img_info.tile_info)-1 + print("Defaulting to highest zoom level ({}).".format(zoom)) + + ## Ensuring image resolution fits in JPEG + if img_info.tile_info[zoom].size[0] > 65535 or img_info.tile_info[zoom].size[1] > 65535: + print( + 'Zoom level {r} too high for JPEG output, using next zoom level {next_zoom} instead'.format( + r=zoom, + next_zoom=zoom-1) + ) + zoom = zoom-1 + print("Using zoom level {}.".format(zoom)) + + coro = load_tiles(img_info, zoom, img_info.image_name, row[1]['quality']) + loop = asyncio.get_event_loop() + loop.run_until_complete(coro) + print(img_info.image_id) + try: + df.at[img_info.image_id, 'downloaded'] = True + except: + print("Archive recording not successful") + print("Download successful. 
Sleeping before next download...") + sleep(randint(30,40)) + print("######### Finished download") + df.to_csv('dlcache') + + if args.csv is None and args.add_url is None and args.download is None: + url = args.url or input("Enter the url of the image: ") + + print("Downloading image meta-information...") + image_info = ImageInfo(url) + + zoom = args.zoom + if zoom is None: + print(image_info) + while True: + try: + zoom = int(input("Which level do you want to download? Choose 11 to default to largest JPEG-compliant level: ")) + if zoom == 11: + ## Ensuring image resolution fits in JPEG. Otherwise, image will be saved as PNG, which does not have max resolution limits (but does not allow for metadata embedding). + zoom = len(img_info.tile_info)-1 + while image_info.tile_info[zoom].size[0] > 65535 or image_info.tile_info[zoom].size[1] > 65535: + print( + 'Zoom level {r} too high for JPEG output, using next zoom level {next_zoom} instead'.format( + r=zoom, + next_zoom=zoom-1) + ) + zoom = zoom-1 + else: + assert 0 <= zoom < len(image_info.tile_info) + break + except (ValueError, AssertionError): + print("Not a valid zoom level.") + + coro = load_tiles(image_info, zoom, args.outfile, args.quality) + loop = asyncio.get_event_loop() + loop.run_until_complete(coro) if __name__ == '__main__': From f570a2a0ade763ee860e00cb5d5844d0fd3bfc9c Mon Sep 17 00:00:00 2001 From: kolt54321 <34801180+kolt54321@users.noreply.github.com> Date: Wed, 15 Jul 2020 14:33:16 -0400 Subject: [PATCH 07/15] Update conditional --- tile_fetch.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tile_fetch.py b/tile_fetch.py index 6cf8189..032d012 100755 --- a/tile_fetch.py +++ b/tile_fetch.py @@ -135,8 +135,7 @@ async def load_tiles(info, z=-1, outfile=None, quality=90): PNG_Output = 0 if info.tile_info[z].size[0] > 65535 or info.tile_info[z].size[1] > 65535: PNG_Output = 1 - print(level.size) - print(z) + img = Image.new(mode="RGB", size=level.size) tiles_dir = 
Path(info.image_name) @@ -186,7 +185,10 @@ async def load_tiles(info, z=-1, outfile=None, quality=90): else: final_image_filename = (author + ' - ' + date + ' - ' + modified_image_name + ' - ' +info.image_id + '.jpg') ## Optimize = True for JPEG breaks ("Suspension not allowed here" error) if quality is 95 and the file is large enough - from what I can test anyway. - img.save(final_image_filename, quality=quality, subsampling=0, optimize=True if quality < 95) + if quality < 95: + img.save(final_image_filename, quality=quality, subsampling=0, optimize=True) + else: + img.save(final_image_filename, quality=quality, subsampling=0) xmp_file_obj = TaggedImage(final_image_filename) if PNG_Output == 0: From 13b86bf48f1d6c93f6e8395ac41dc9e26eb9760a Mon Sep 17 00:00:00 2001 From: kolt54321 <34801180+kolt54321@users.noreply.github.com> Date: Wed, 15 Jul 2020 17:27:24 -0400 Subject: [PATCH 08/15] Update requirements.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index c0ac8b9..e69dc24 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ Pillow~=7.1.2 aiohttp~=3.6.2 pyexiv2~=2.2.0 pandas +unidecode From cffddaba03d91afc6b06e32e9e860e83d49b1677 Mon Sep 17 00:00:00 2001 From: kolt54321 <34801180+kolt54321@users.noreply.github.com> Date: Wed, 15 Jul 2020 17:30:48 -0400 Subject: [PATCH 09/15] Revert "Update requirements.txt" This reverts commit 13b86bf48f1d6c93f6e8395ac41dc9e26eb9760a. 
--- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e69dc24..c0ac8b9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,4 +4,3 @@ Pillow~=7.1.2 aiohttp~=3.6.2 pyexiv2~=2.2.0 pandas -unidecode From 4bbf4bfcd5ab06fc945893140515263764abe73f Mon Sep 17 00:00:00 2001 From: kolt54321 <34801180+kolt54321@users.noreply.github.com> Date: Wed, 15 Jul 2020 21:24:18 -0400 Subject: [PATCH 10/15] Update tile_fetch.py --- tile_fetch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tile_fetch.py b/tile_fetch.py index 032d012..173d272 100755 --- a/tile_fetch.py +++ b/tile_fetch.py @@ -285,7 +285,7 @@ def main(): print("Defaulting to highest zoom level ({}).".format(zoom)) ## Ensuring image resolution fits in JPEG - if img_info.tile_info[zoom].size[0] > 65535 or img_info.tile_info[zoom].size[1] > 65535: + while img_info.tile_info[zoom].size[0] > 65535 or img_info.tile_info[zoom].size[1] > 65535: print( 'Zoom level {r} too high for JPEG output, using next zoom level {next_zoom} instead'.format( r=zoom, From c458710d23322dd799fd48271e42e624c5357631 Mon Sep 17 00:00:00 2001 From: kolt54321 <34801180+kolt54321@users.noreply.github.com> Date: Sun, 25 Jul 2021 15:15:46 -0400 Subject: [PATCH 11/15] Updated author/date created source Instead of taking directly from the metadata, this pulls the author/date created from the page source. Cleaner and more reliable. 
--- tile_fetch.py | 226 +++++++++++++++++++++++++++++++------------------- 1 file changed, 142 insertions(+), 84 deletions(-) diff --git a/tile_fetch.py b/tile_fetch.py index 173d272..1f50e20 100755 --- a/tile_fetch.py +++ b/tile_fetch.py @@ -18,6 +18,7 @@ from pathlib import Path import aiohttp +import html as html_1 from PIL import Image from lxml import etree, html from pyexiv2 import Image as TaggedImage @@ -41,13 +42,39 @@ def compute_url(path, token, x, y, z): url_bytes = b'https://lh3.googleusercontent.com/%s=x%d-y%d-z%d-t%s' % (path, x, y, z, signature) return url_bytes.decode('utf-8') +def remove(value, deletechars): + for c in deletechars: + value = value.replace(c,'') + return value; class ImageInfo(object): RE_URL_PATH_TOKEN = re.compile(rb']\r?\n,"(//[^"/]+/[^"/]+)",(?:"([^"]+)"|null)', re.MULTILINE) def __init__(self, url): page_source = urllib.request.urlopen(url).read() - + ## Finding author and title from outside the metadata - this will be a cleaner way to title the image output + page_source_html = html.fromstring(page_source) + page_source_bytes = etree.tostring(page_source_html) + page_source_str = page_source_bytes.decode('utf-8') + artist_search_str = "categoryId=artist\">" + title_search_str = "UEmoBd(FQhpwf);R7KM6d:r1oohb;\" data-title=\"" + title_search_end = "\" data-galabel=\"asset-viewer" + page_source_str_artist = page_source_str.split(artist_search_str,9) + page_source_str_title = page_source_str.split(title_search_str,9) + self.page_source_artist = "" + try: + page_source_artist = unidecode.unidecode(html_1.unescape(page_source_str_artist[1].split("<",1)[0])) + self.page_source_artist = remove(page_source_artist, '\/:*?"<>|') + self.source_artist_flag = 1 + except: + self.source_artist_flag = 0 + page_source_title = unidecode.unidecode(html_1.unescape(page_source_str_title[1].split(title_search_end,1)[0])) + self.page_source_title = remove(page_source_title, '\/:*?"<>|') + self.page_source_title = self.page_source_title[:230] + while 
self.page_source_title[-1:] == "." or self.page_source_title[-1:] == " ": + self.page_source_title = self.page_source_title.rstrip(".") + self.page_source_title = self.page_source_title.rstrip(" ") + self.metadata = {'Xmp.xmp.URL': url} for item in html.fromstring(page_source).cssselect('[id^="metadata"] li'): text = item.text_content() @@ -55,7 +82,7 @@ def __init__(self, url): # removes and non-word character from the title as they invalid for metadata tag names key = 'Xmp.xmp.' + re.sub(r'\W', '', text[:text.find(':')]) self.metadata[key] = text[text.find(':') + 1:].strip() - + match = self.RE_URL_PATH_TOKEN.search(page_source) if match is None: raise ValueError("Unable to find google arts image token") @@ -65,8 +92,16 @@ def __init__(self, url): self.token = token or b'' url_path = urllib.parse.unquote_plus(urllib.parse.urlparse(url).path) self.image_slug, image_id = url_path.split('/')[-2:] - self.image_name = unidecode.unidecode(string.capwords(self.image_slug.replace("-"," "))) + # self.image_name = unidecode.unidecode(string.capwords(self.image_slug.replace("-"," "))) + # self.image_name = self.image_name.replace(":","-") + # self.image_name = self.image_name.replace("\"","\'") + # self.image_name = unidecode.unidecode(self.image_name) + # self.image_name = remove(self.image_name, '\/:*?"<>|') + # self.image_name = self.image_name[:250] + self.image_name = self.page_source_title self.image_id = image_id + + # self.image_name_2 = '%s - %s' % (string.capwords(self.image_slug.replace("-"," ")), image_id) meta_info_url = "https:{}=g".format(url_no_proto.decode('utf8')) meta_info_tree = etree.fromstring(urllib.request.urlopen(meta_info_url).read()) @@ -86,7 +121,12 @@ def __repr__(self): '\n'.join(map(str, self.tile_info)) ) - + def remove(value, deletechars): + for c in deletechars: + value = value.replace(c,'') + return value; + + class ZoomLevelInfo(object): def __init__(self, img_info, level_num, attrs): self.num = level_num @@ -120,6 +160,7 @@ async def 
fetch_tile(session, image_info, tiles_dir, x, y, z): async def load_tiles(info, z=-1, outfile=None, quality=90): + if z >= len(info.tile_info): print( 'Invalid zoom level {z}. ' @@ -135,10 +176,10 @@ async def load_tiles(info, z=-1, outfile=None, quality=90): PNG_Output = 0 if info.tile_info[z].size[0] > 65535 or info.tile_info[z].size[1] > 65535: PNG_Output = 1 - + img = Image.new(mode="RGB", size=level.size) - - tiles_dir = Path(info.image_name) + tiles_dir = Path(info.page_source_title) + tiles_dir.mkdir(exist_ok=True) async with aiohttp.ClientSession() as session: @@ -150,7 +191,7 @@ async def load_tiles(info, z=-1, outfile=None, quality=90): ] print("Downloading tiles...") tiles = await async_tile_fetcher.gather_progress(awaitable_tiles) - + for x, y, encrypted_bytes in tiles: clear_bytes = decrypt(encrypted_bytes) tile_img = Image.open(io.BytesIO(clear_bytes)) @@ -162,50 +203,59 @@ async def load_tiles(info, z=-1, outfile=None, quality=90): author = "0" date = "" for key, value in info.metadata.items(): - if key.lower() == "xmp.xmp.creator" or key.lower() == "xmp.xmp.painter": - # Avoiding non-ASCII characters in the painter/creator name - author = unidecode.unidecode(value) - elif key.lower() == "xmp.xmp.date" or key.lower() == "xmp.xmp.datecreated": - # Avoiding "/" in the date (year), especially when multiple dates are given - date = value.replace('/','-') + if info.source_artist_flag == 0: + if key.lower() == "xmp.xmp.creator" or key.lower() == "xmp.xmp.painter" or key.lower() == "xmp.xmp.illustrator": + # Avoiding non-ASCII characters in the painter/creator name. 
This runs if artist name could not be found from the page source + author = unidecode.unidecode(value) + author = author.replace("?","") + author = author.replace("\/","-") + author = author.replace("/","-") + author = author.replace("|","-") + author = author.replace("\\","-") + author = author.replace(":","-") + author = author.replace('"','') + author = author.replace('[','(') + author = author.replace(']',')') + author = author.replace("\n"," ") + author = author.replace("*"," ") + author = author.replace('<','') + info.page_source_artist = author[:30] + if key.lower() == "xmp.xmp.date" or key.lower() == "xmp.xmp.datecreated": + # Avoiding "/" in the date (year), especially when multiple dates are given. Not deprecating with author as date is often missing from the page source + date = unidecode.unidecode(value) + date = date.replace('/','-') + date = date.replace('?','') + date = date.replace('\\','') + date = date.replace("\n"," ") + date = date.replace(':','') + date = date.replace('[','(') + date = date.replace(']',')') + date = date.replace(';',',') + date = date.replace('"','') + date = date[:25] - # Taking out the author's name from the image name - authors name is appended later - modified_image_name = info.image_name[0:len(info.image_name)-len(author)-1] - - if PNG_Output == 1: - if author == 0: - final_image_filename = (info.image_name + '.png') - else: - final_image_filename = (author + ' - ' + date + ' - ' + modified_image_name + ' - ' +info.image_id + '.png') - ## Optimize=True for PNG attempts the highest level of lossless compression possible. - img.save(final_image_filename, optimize=True) - else: - if author == 0: - final_image_filename = (info.image_name + '.jpg') - else: - final_image_filename = (author + ' - ' + date + ' - ' + modified_image_name + ' - ' +info.image_id + '.jpg') - ## Optimize = True for JPEG breaks ("Suspension not allowed here" error) if quality is 95 and the file is large enough - from what I can test anyway. 
- if quality < 95: - img.save(final_image_filename, quality=quality, subsampling=0, optimize=True) - else: - img.save(final_image_filename, quality=quality, subsampling=0) + if(info.page_source_artist) == "": + info.page_source_artist = "No Author" - xmp_file_obj = TaggedImage(final_image_filename) - if PNG_Output == 0: - try: - xmp_file_obj.modify_xmp(info.metadata) - except: - print("Cannot write all metadata at once; writing tag by tag...") - # writes key:value one at a time, which is heavier on writes, - # but far more robust. - for key, value in info.metadata.items(): - try: - xmp_file_obj.modify_xmp({key: value}) - except RuntimeError: - print(f'Failed to add add XMP tag with key "{key}" with value "{value}"') - print(repr(e)) + title_max_length = 245 - len(info.page_source_artist + ' - ' + date + ' - ' + ' - ' + info.image_id)+3 + alt_image_filename = (info.page_source_artist + ' - ' + date + ' - ' + info.page_source_title[:(title_max_length)] + ' - ' + info.image_id + '.jpg') + img.save(alt_image_filename, quality=90, subsampling=0, optimize=True) + print("Adding metadata...") + xmp_file_obj = TaggedImage(alt_image_filename) + try: + xmp_file_obj.modify_xmp(info.metadata) + except: + print("Cannot write all metadata at once; writing tag by tag...") + # writes key:value one at a time, which is heavier on writes, + # but far more robust. 
+ for key, value in info.metadata.items(): + try: + xmp_file_obj.modify_xmp({key: value}) + except RuntimeError as e: + print(f'Failed to add add XMP tag with key "{key}" with value "{value}"') + print(repr(e)) shutil.rmtree(tiles_dir) - print("Saved the result as " + final_image_filename) + print("Saved the result as " + alt_image_filename) def main(): @@ -224,9 +274,10 @@ def main(): parser.add_argument('-b', '--batch-add', type=str, nargs=1, help="Adds a list of URL's to the queue from a csv file.", action="store", dest='csv') parser.add_argument('-d', '--download', help="Downloads all remaining links in the queue.",action="store_true", default=None) args = parser.parse_args() - + + assert 0 <= args.quality <= 95, "Image quality must be between 0 and 95" - + if args.csv or args.add_url or args.download: df = None try: @@ -259,51 +310,58 @@ def main(): print("Image already in list. Ignoring the URL.") if args.download: + print("######### Starting download") for row in df.loc[df['downloaded'] == False].iterrows(): print(row[1]['url']) img_info = None + #assert 0 <= ImageInfo(row[1]['quality']) <= 95, "Image quality must be between 0 and 95" try: img_info = ImageInfo(row[1]['url']) - except: - print("Invalid url.") - valid_url = False - - #if args.quality is None: - maybe add handling for overwriting quality in batch file? 
- assert 0 <= ImageInfo(row[1]['quality']) <= 95, "Image quality must be between 0 and 95" - - if img_info: - if args.zoom: - zoom = args.zoom - try: - assert 0 <= zoom < len(img_info.tile_info) - except: - print('No valid zoom level.') - else: - zoom = len(img_info.tile_info)-1 - print("Defaulting to highest zoom level ({}).".format(zoom)) - - ## Ensuring image resolution fits in JPEG - while img_info.tile_info[zoom].size[0] > 65535 or img_info.tile_info[zoom].size[1] > 65535: - print( - 'Zoom level {r} too high for JPEG output, using next zoom level {next_zoom} instead'.format( - r=zoom, - next_zoom=zoom-1) - ) - zoom = zoom-1 - print("Using zoom level {}.".format(zoom)) - - coro = load_tiles(img_info, zoom, img_info.image_name, row[1]['quality']) - loop = asyncio.get_event_loop() - loop.run_until_complete(coro) - print(img_info.image_id) + + if img_info: + if args.zoom: + zoom = args.zoom + try: + assert 0 <= zoom < len(img_info.tile_info) + except: + print('No valid zoom level.') + else: + zoom = len(img_info.tile_info)-1 + print("Defaulting to highest zoom level ({}).".format(zoom)) + + ## Ensuring image resolution fits in JPEG + while img_info.tile_info[zoom].size[0] > 65535 or img_info.tile_info[zoom].size[1] > 65535: + print( + 'Zoom level {r} too high for JPEG output, using next zoom level {next_zoom} instead'.format( + r=zoom, + next_zoom=zoom-1) + ) + zoom = zoom-1 + print("Using zoom level {}.".format(zoom)) + try: + coro = load_tiles(img_info, zoom, img_info.image_name, row[1]['quality']) + loop = asyncio.get_event_loop() + loop.run_until_complete(coro) + except: + try: + coro = load_tiles(img_info, zoom, img_info.image_name, row[1]['quality']) + loop = asyncio.get_event_loop() + loop.run_until_complete(coro) + except: + coro = load_tiles(img_info, zoom, img_info.image_name, row[1]['quality']) + loop = asyncio.get_event_loop() + loop.run_until_complete(coro) + except Exception: + pass try: df.at[img_info.image_id, 'downloaded'] = True + 
df.to_csv('dlcache') except: print("Archive recording not successful") print("Download successful. Sleeping before next download...") - sleep(randint(30,40)) + sleep(randint(15,20)) print("######### Finished download") df.to_csv('dlcache') From 9601eb9b16e7e0e7b8a74b1f3f38c6c172ea648c Mon Sep 17 00:00:00 2001 From: kolt54321 <34801180+kolt54321@users.noreply.github.com> Date: Sun, 25 Jul 2021 15:21:39 -0400 Subject: [PATCH 12/15] Organizes outputs into artist folders Note - still fails where "!" is contained in the name. Otherwise creates and updates folders with each artist's name. --- Create Artist Folders.bat | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 Create Artist Folders.bat diff --git a/Create Artist Folders.bat b/Create Artist Folders.bat new file mode 100644 index 0000000..73b1143 --- /dev/null +++ b/Create Artist Folders.bat @@ -0,0 +1,16 @@ +@echo off +setlocal EnableExtensions DisableDelayedExpansion +set "SourceDir=C:\Users\XXXX" +set "DestDir=C:\Users\XXXX\Artists3" + +for /F "eol=| delims=" %%A in ('dir /B /A-D-H "%SourceDir%\*-*.jpg" 2^>nul') do ( + set "string=%%~A" + setlocal enabledelayedexpansion + SET "end=!string:* - =!" + FOR /F "delims=" %%G IN ("!end!") do set "begin=!string: - %%~G=!" + md "%DestDir%\!begin!" 2>nul + move /Y "%SourceDir%\%%A" "%DestDir%\!begin!\" + endlocal +) + +endlocal \ No newline at end of file From 5365fcae6411cca098f3c17779e8aba757a47b0b Mon Sep 17 00:00:00 2001 From: kolt54321 <34801180+kolt54321@users.noreply.github.com> Date: Sun, 25 Jul 2021 15:31:08 -0400 Subject: [PATCH 13/15] AHK to acquire links for batch Loops through artist pages and grabs links. Has Chrome plugin dependencies - not plug/play. 
--- AHK_inputs/Arrow1.PNG | Bin 0 -> 316 bytes AHK_inputs/Discover1.PNG | Bin 0 -> 4324 bytes AHK_inputs/EndPage1.png | Bin 0 -> 175 bytes Arts.ahk | 118 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 118 insertions(+) create mode 100644 AHK_inputs/Arrow1.PNG create mode 100644 AHK_inputs/Discover1.PNG create mode 100644 AHK_inputs/EndPage1.png create mode 100644 Arts.ahk diff --git a/AHK_inputs/Arrow1.PNG b/AHK_inputs/Arrow1.PNG new file mode 100644 index 0000000000000000000000000000000000000000..667b26f41881144bb553fe163b77b00b6eeb9835 GIT binary patch literal 316 zcmeAS@N?(olHy`uVBq!ia0vp^DnKm9!3HE<9_|SSQjEnx?oJHr&dIz4a#+$GeH|GX zHuiJ>Nn{1`ISV`@iy0V%N=IS)Aqv?jjZz^Ln@y@4s$ z;o#zF3G3v(u9!AK%;t8}0tK&tB0-tTi4V@+e6z=4!q3DLUz2C*+@93-R6&3E}iDMzPI9<2caIitpFl6Vgi_C+>UUZ}14{IR;Nx KKbLh*2~7a+b9xy7 literal 0 HcmV?d00001 diff --git a/AHK_inputs/Discover1.PNG b/AHK_inputs/Discover1.PNG new file mode 100644 index 0000000000000000000000000000000000000000..b819eaa653fa79f09c9aae69a12dd405c41232b9 GIT binary patch literal 4324 zcmbtY)mIb%^Ic-4B&3BUm9C|fkfmg4kdP7(SQ;fImQd+#0cmulI|Y}nrCZ4b=_O?8 zjvwD2@H^kb_i$&R)H^ zLa5aj)Q+GGoRj;Go^#XR@j+$<1_iGhv}YlBq5SK>0VFH3%>%0`mXXnvw} z=UdKMMecoXpCKeEg3rB^EQTPl$&#vL|B~-0a4QkGsp>}*dGeRxc=n<_eV>+_O?>|H z^U3YJNHBL+%!E7L{Am4nYEjl1)k}ayP_AB9rSRZa^;Ik|_{H~S?^RD#s7By+o zxYD`7x~%hGZS5IW3&O9yHS7fvl`Pvkn=~(Mt?USoe6E*t4{d5SJGlP(V@VsG%kE|@ zuFg!Rj3yGAbDbaA+h7Dk*Quma{dFr&p1v7K?AV*n4I)XU-eR{KO-CM+WSvU}2o4Vz zdZkWRN1HCD?2}GLDT~-%|y{uW1D&5Rw^=C$_3%nXzTSyI& zMAeoHb1l~1!CqH&W0{O#IU3vVbj%+RlO4exppii&S+Os_{m*Zq>@arMV2I%~6j37Y=#5v(itc(L{?)zRQ| zd{qB1{+xN_NkEOHn$XEGIIC8d(ZQ2QWmIHYqZQ{e7RykP+tXSD>x0B$bNT|5{$*z* z0b!Pq8me@zW?<)Wz%L#aJNKk#XDy#JxKg^tVYr;(!pF$^YhCGz`apK;-lZo?*~?7j zpNlh{Q*(I);398|M58OSUN2IU@QPCU)pSN(JslauT+nh}K#8tScLw z5wCgS4La%7(6j;1Z}m|F0!&Zk`=^OogjxFe4;FAHOz2bH1_jXxS<2t{$)&chn_6}0 ziMPijA$-xOV*LE@~ z_AL^&4SurSY1K@6F5>!e`Pd&AB#D)i$ntmb6?`$Sp8oEZQ6KW4Kg`{9@24sfplH@; zp-O?sg=D0RdN!{AV}49-OsPalf1m=_i*ifqm)f{5O-AK7 
z7k8(~E&U|ujg^D__+?A4{tdrgBBHmHxmc(s$}Mm%u|EHR=@do3RyXioFPCSr)uFKD z>4X~cM2ah|Z*)%pTH<3HUJ&=V7cYbr`}&a_#~*_&(F~OoAU=}w+a&A zrrLL#N9X0}Y6eQ5L9OP?bFTED)zOW78MoF_Wa#eof%@iOA!77=e#;5L5k9ucZL$#< zM9R03Kn^v>xG#S%r(3w6h?=l){eyIK+C;(46ggDvfDn(rrjsGfzjRPHyn6oR%c*$Y zG| zN1t#bnCysOM0#{vp0xnnzWSKa4FR)Cj-BzWG`bycJjq61S<@qu*$QkOl{;j;w)^V} zysY0p7CK7{8)7VeznCESs*>QJ{>8jbB8$%Ejnin_2DUu4sW*oh;#=o)l)%4=O_$~6 zm`3DBwhK?iG@pFDK9qoV_fD}_Rs*4T?7wh?BaE`3d_K}6gClEE9smUY4i4Vj8~%ZL zZZ}#naRv%?)sFXF0&w#QkaR5jxJ!1AnKxliTVuR%+Oe}z@9N=GrP>z74C|yL*)ZdA z!`~D9Z*fG`d7DTHG|tz$RxP!~_vkGLqQp1Uu&R(dQ|?@z2>R}UfI6{%WLr5sbjd16 zUc~9DcqAteG;iuG^5VRZKa~wKj9Dvzf?fPf`?93s)9fcSZit$}+Ds$ix zh|j2H_%Yb+NP)NAVV1K5Y+rbwK80AzI1Vyn3k{MbqsvZPlMX!!KHkyjCRNoY-&$bt zqdDVbTlt*Ev}TQ!RDqBHYxtU~415|QSh2QqP|Zp?c{N|>h;M)nKi`WJn;#`LSq}yg z=jxHt(_pEq>Y8VpW5umi39?T=SK0iFS8Q$`-8)jEd|?djp~V!FhY$qx-Pj0j_n&#i zjav017DgYzTVslsX{T}}W6R{O65n3ykU&xWrfJhjDRiF_Ddj;ZStz~qJS=S+#No&7 z!*a`g=c;ftDN*8uZ}rgr*PrMNev!w(Lk}*q`=cPZK0@k%uFxKrzI!Ol?XJ*9(*M#q z$ht;(n37pGL048i#?~>u>kU?|IK9wmB5t^vHJGE%6hsoW=Ie^$5g2@&%0Dhealj9g zpSWs(?^txxqJ=xtdWJM59|JSKP##+LlS(S=b&4)NS{?RMcK`(bI5RzyU zVYA7MJj#o1|Kab6;Pg-D=k4H2WtJ6r_fIPv_Gb9is_%ZdU(q7bq8~ZTnaAOUyOG+< z6*=6P6FgURD-SUAP&Dz8V(d$_#B<3Q4?C@Ix38u85Q9OI;Q}lW32d3(^`VS^SF59l zCFHRhze)3w2jf|5>|X6}mWt-XDmmCRY0O;?P?@iPn7z#yl`6+lkfqTM#26A9Jg9!B zj8>u~DK@3@0Ex>yi&-*v`srpOH%C0#{re=pR?l;h-H-Dec$L>kF!~?>I!F7)KNm_& zeDf>SAi(iQ<*!#BK}5>!Xmuv~WSP*m*knBDl2Ft~x8v?rtc~auQn^NtIo0O+z>g=z zm%GZGihsbF(r}*!72A%30B%-c9J@2Lo_OADKT_PN{K`tQ5pHO|;%jVg$%B=WWEyfFKoC~ADI-m+fCN^WufxKfUjgM_uUk-RWi222Kr z!WD>J>5HuZL|+-)La+~N3 zkkDEoNV)%=R%lr#!qR_HhKjL5wek z*D8^IjRLd1CmFp90YxAK!Ak@?e)&btSEON_G-@=@PHNa?XN5|yuP(+&fVDsDHBPW0 zpEOeO7zM?Q*v`pAl{!YI;Exw680E*SG&_R+igjv}*>eaE zdi%&jLq#dPk*^$Yy-O;o|3ueaUc^619BYZD!pPnMvv#lKsYGzuaF|3&oGfj_l|tZk zwAr0aWCXUfGAn*?I<%|Tmri}D2%b}kfj72^uZv{jYHaV~{f&<6sAUjTD;1yh3yFj) zzZD53oVf>Z!k03~bHr4WQOB$$_EwhR5<_hR#Biac!-L316d6C-@pFD2*^>%*S;YIT zj)@;k<-jHh)aiEd$Bj$kA6Ny73r%;SmkTZWSh$aIb|DsPvLEqHwI;7mR(@KCsqF)- 
zM~HcQGsU6gsx8S_{jIoi4aMG5am8F%qCm6d;d(!VFT)GliqwsjvY9yPcIohZfWQmP z7N{G<$i-~ZRM>?KnJ+mCe7F$)$}P$f0zp=M==J>_ zT^vU|g`6qo0-hG;75of1%HfeVHy&rtZ6Jx|>G2zLeVW7~s+aNTZ2(YQ**h-|k61E+aXQy0N7Nsb5n=RBPsrv53feG~31&tl^Pck$sMtki^Le)`#<@+GA%{vai1_9Kvt2yLzopr0Q9FYbpQYW literal 0 HcmV?d00001 diff --git a/Arts.ahk b/Arts.ahk new file mode 100644 index 0000000..f5cbefd --- /dev/null +++ b/Arts.ahk @@ -0,0 +1,118 @@ + +; Loops through artist pages and copies links +; Requires "Link grabber" extension +; Requires open excel file to copy links to +; 1080p monitor - additional resolutions will require changing coord +; Starting point - https://artsandculture.google.com/category/artist?tab=az&date=1920&pr=A + +SetWorkingDir %A_ScriptDir% + +Esc::ExitApp + +^j:: +Loop, 1000 +{ + ;Searches for end of page, moves to next letter alphabetically if so + ImageSearch, FoundX, FoundY, 1880, 995, 1935, 1040, C:\Users\XXXX\AHK_inputs\EndPage1.PNG + if (ErrorLevel = 2) + MsgBox Could not conduct the search. + else if (ErrorLevel <> 1) + { + FoundYY = 1 + Send {Home} + sleep, 1000 + ; Loops to the next alphabetical letter + MouseMove, 1075, 378 + Mouseclick + sleep, 2500 + Send {Down 4} + } + +MouseMove, 360, 442, 60 +Send, ^{LButton} +sleep, 600 +Loop, 4 +{ +MouseMove, 300, 0, 10, R +Send, ^{LButton} +sleep, 900 +} +Gosub, Label1 +sleep, 600 +Send {Down 6} +sleep, 1500 +} +return + +Label1: +Loop, 5 +{ +send, ^{tab} +CoordMode Pixel ; Interprets the coordinates below as relative to the screen rather than the active window. +FoundYY = 0 +while FoundYY = 0 +{ +ImageSearch, FoundX, FoundY, 0, 0, A_ScreenWidth, A_ScreenHeight, C:\Users\XXXX\AHK_inputs\Discover1.PNG +if (ErrorLevel = 2) + MsgBox Could not conduct the search. +else if (ErrorLevel = 1) + Send {Down 6} +else if (FoundY > 500) + Send {Down 6} +else + { + ;MsgBox The icon was found at %FoundX%x%FoundY%. 
+ FoundYY = 1 + MouseMove, FoundX, FoundY + ; Following *must* be changed to the coordates of the link grabber extension + MouseMove, 1600, 442, 10, R + } +sleep, 500 +} +Arrow = 1 +MouseMove, -100, -100, 0, R +MouseGetPos, xpos, ypos +MouseMove, 100, 100, 0, R +xpos2:= xpos+300 +ypos2:= ypos+300 +ImageSearch, FoundX2, FoundY2, %xpos%, %ypos%, %xpos2%, %ypos2%, C:\Users\XXXX\AHK_inputs\Arrow1.PNG +if (ErrorLevel = 1 or ErrorLevel = 2) + Arrow = 0 +else + Arrow = 1 +while Arrow = 1 +{ +Mouseclick +MouseMove, -100, -100, 0, R +MouseGetPos, xpos, ypos +xpos2:= xpos+300 +ypos2:= ypos+300 +sleep, 1000 +;ImageSearch, FoundX2, FoundY2, 0, 0, A_ScreenWidth, A_ScreenHeight, C:\Users\XXXX\AHK_inputs\Arrow1.PNG +ImageSearch, FoundX2, FoundY2, %xpos%, %ypos%, %xpos2%, %ypos2%, C:\Users\XXXX\AHK_inputs\Arrow1.PNG +if (ErrorLevel = 1 or ErrorLevel = 2) +{ + ;msgbox Done! + Arrow = 0 +} +else + Arrow = 1 +MouseMove, 100, 100, 0, R +} +MouseMove, 1580, 70 +Mouseclick +sleep, 1000 +MouseMove, 1800, 200 +Mouseclick +sleep, 300 +WinActivate, ahk_class XLMAIN +sleep, 800 +Clip0 = %ClipBoardAll% +ClipBoard = %ClipBoard% ; Convert to text +SendInput ^v^{Down}{Down} +sleep, 2000 +WinActivate, ahk_class Chrome_WidgetWin_1 +SendInput ^w +SendInput ^w +} +Return From 349bb6ca09dd3deffbddd057a4e142aeec903b2f Mon Sep 17 00:00:00 2001 From: kolt54321 <34801180+kolt54321@users.noreply.github.com> Date: Sun, 25 Jul 2021 15:37:50 -0400 Subject: [PATCH 14/15] Updated requirements --- requirements.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index c0ac8b9..96e6ca2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,5 +2,8 @@ pycryptodome lxml~=4.5.1 Pillow~=7.1.2 aiohttp~=3.6.2 -pyexiv2~=2.2.0 +pyexiv2~=2.3.0 +cssselect +unidecode pandas +goto \ No newline at end of file From ebf1767753a7ac96d41f0af21026f61329597b06 Mon Sep 17 00:00:00 2001 From: kolt54321 <34801180+kolt54321@users.noreply.github.com> Date: Sat, 4 Sep 2021 
22:58:44 -0400 Subject: [PATCH 15/15] Push Regex Change --- tile_fetch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tile_fetch.py b/tile_fetch.py index 1f50e20..74702e5 100755 --- a/tile_fetch.py +++ b/tile_fetch.py @@ -48,7 +48,7 @@ def remove(value, deletechars): return value; class ImageInfo(object): - RE_URL_PATH_TOKEN = re.compile(rb']\r?\n,"(//[^"/]+/[^"/]+)",(?:"([^"]+)"|null)', re.MULTILINE) + RE_URL_PATH_TOKEN = re.compile(rb'],"(//[^"/]+/[^"/]+)",(?:"([^"]+)"|null)', re.MULTILINE) def __init__(self, url): page_source = urllib.request.urlopen(url).read()