From 2db5be3438e48cf5d362c17d8390f6c5164bbdc6 Mon Sep 17 00:00:00 2001 From: rra Date: Thu, 28 Nov 2024 21:47:00 +0100 Subject: [PATCH] follow navigation links in opds to retrieve entire catalog --- konfluks/feed.py | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/konfluks/feed.py b/konfluks/feed.py index 844e0da..8360c12 100644 --- a/konfluks/feed.py +++ b/konfluks/feed.py @@ -312,12 +312,15 @@ def create_opds_post(post_dir, entry): ft = item['type'].split('/')[-1] fn = item['rel'].split('/')[-1] - if fn == "acquisition": - fn = "publication" #calling the publications acquisition is weird + # entry.links has image, thumbnail and publication/acquisition. + # Only downloading image for now + #if fn == "acquisition": + #fn = "publication" #calling the publications acquisition is weird - prefered_name = "{}-{}.{}".format(fn, slugify(entry['title']), ft) - - grab_media(post_dir, item['href'], prefered_name) + if 'image' in fn: + prefered_name = "{}-{}.{}".format(fn, slugify(entry['title']), ft) + grab_media(post_dir, item['href'], prefered_name) + frontmatter['featured_image'] = prefered_name if "summary" in entry: summary = entry.summary @@ -333,6 +336,18 @@ def create_opds_post(post_dir, entry): timestamp = arrow.get(entry['updated_parsed']) f.write(timestamp.format('X')) +def opds_fetch_more(data): + """ + Look for more OPDS feeds to pull, until we no longer hit the "next" navigation property. 
+ """ + for link in data.feed.links: + for i in link: + if link[i] == 'next': + print(link['href']) + data = grab_feed(link['href']) + return data + return None + def main(): feed_urls = open("feeds_list.txt", "r").read().splitlines() @@ -379,12 +394,23 @@ def main(): if data: #whenever we get a 200 if data.feed: #only if it is an actual feed opds_feed = False + opds_entries = [] if 'links' in data.feed: for i in data.feed['links']: if i['rel'] == 'self': if 'opds' in i['type']: opds_feed = True print("OPDS type feed!") + feed_data = data + while feed_data: + feed_data = opds_fetch_more(feed_data) + if feed_data: + for i in feed_data.entries: + opds_entries.append(i) + for i in opds_entries: + data['entries'].append(i) + + for entry in data.entries: # if 'tags' in entry: