From fecf5cd64e669fec8f488d92baadf404661d029e Mon Sep 17 00:00:00 2001 From: rra Date: Tue, 24 May 2022 15:39:11 +0200 Subject: [PATCH 1/4] add rudimentary support for enclosures & featured images --- lumbunglib/feed.py | 28 +++++++++++++++++++++++++++- lumbunglib/templates/feed.md | 1 + 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/lumbunglib/feed.py b/lumbunglib/feed.py index 1e25fdc..4113e52 100644 --- a/lumbunglib/feed.py +++ b/lumbunglib/feed.py @@ -85,6 +85,11 @@ def create_frontmatter(entry): for t in entry.tags: tags.append(t['term']) + if "featured_image" in entry: + featured_image = entry.featured_image + else: + featured_image = '' + card_type = "network" if entry.feed_name == "pen.lumbung.space": card_type = "pen" @@ -110,7 +115,8 @@ def create_frontmatter(entry): 'original_link': entry.link, 'feed_name': entry['feed_name'], 'tags': str(tags), - 'card_type': card_type + 'card_type': card_type, + 'featured_image': featured_image } return frontmatter @@ -136,11 +142,30 @@ def sanitize_yaml (frontmatter): return frontmatter +def parse_enclosures(post_dir, entry): + """ + Parses feed enclosures which are featured media + Can be featured image but also podcast entries + https://pythonhosted.org/feedparser/reference-entry-enclosures.html + """ + #TODO parse more than images + #TODO handle the fact it could be multiple items + + for e in entry.enclosures: + print("found enclosed media", e.type) + if "image/" in e.type: + featured_image = grab_media(post_dir, e.href) + entry["featured_image"] = featured_image + return entry + def create_post(post_dir, entry): """ write hugo post based on RSS entry """ + if "enclosures" in entry: + entry = parse_enclosures(post_dir, entry) + frontmatter = create_frontmatter(entry) if not os.path.exists(post_dir): @@ -202,6 +227,7 @@ def parse_posts(post_dir, post_content): for img in soup(["img", "object"]): if img.get("src") != None: + local_image = grab_media(post_dir, img["src"]) if img["src"] != local_image: img["src"] = local_image diff --git a/lumbunglib/templates/feed.md b/lumbunglib/templates/feed.md index d9f3f9a..71d984b 100644 --- a/lumbunglib/templates/feed.md +++ b/lumbunglib/templates/feed.md @@ -8,6 +8,7 @@ original_link: "{{ frontmatter.original_link }}" feed_name: "{{ frontmatter.feed_name}}" categories: ["{{ frontmatter.card_type }}", "{{ frontmatter.feed_name}}"] tags: {{ frontmatter.tags }} +{% if frontmatter.featured_image %}featured_image: "{{frontmatter.featured_image}}"{% endif %} --- {{ content }} From c84a9758871766897b08b9f6ae2eae536bc26a26 Mon Sep 17 00:00:00 2001 From: rra Date: Sun, 29 May 2022 12:30:55 +0200 Subject: [PATCH 2/4] add reason for failure --- lumbunglib/feed.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lumbunglib/feed.py b/lumbunglib/feed.py index 4113e52..a78c032 100644 --- a/lumbunglib/feed.py +++ b/lumbunglib/feed.py @@ -206,6 +206,9 @@ def grab_media(post_directory, url, prefered_name=None): shutil.copyfileobj(response.raw, media_file) print('Downloaded media item', media_item) return media_item + else: + print("Download failed", response.status_code) + return url return media_item elif os.path.exists(os.path.join(post_directory, media_item)): return media_item @@ -227,7 +230,6 @@ def parse_posts(post_dir, post_content): for img in soup(["img", "object"]): if img.get("src") != None: - local_image = grab_media(post_dir, img["src"]) if img["src"] != local_image: img["src"] = local_image From cab36c8ac63acd7ceadcfcf8856cf2f3607bf8ed Mon Sep 17 00:00:00 2001 From: rra Date: Sun, 29 May 2022 14:45:11 +0200 Subject: [PATCH 3/4] add less generic headers --- lumbunglib/feed.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lumbunglib/feed.py b/lumbunglib/feed.py index a78c032..f5a8d42 100644 --- a/lumbunglib/feed.py +++ b/lumbunglib/feed.py @@ -194,13 +194,17 @@ def grab_media(post_directory, url, prefered_name=None): """ media_item = urlparse(url).path.split('/')[-1] + headers = { + 'User-Agent': 'https://git.autonomic.zone/ruangrupa/lumbunglib', + 'From': 'info@lumbung.space' # This is another valid field + } if prefered_name: media_item = prefered_name try: if not os.path.exists(os.path.join(post_directory, media_item)): #TODO: stream is true is a conditional so we could check the headers for things, mimetype etc - response = requests.get(url, stream=True) + response = requests.get(url, headers=headers, stream=True) if response.ok: with open(os.path.join(post_directory, media_item), 'wb') as media_file: shutil.copyfileobj(response.raw, media_file) From ad591ea9cf89d896a53fdc3b649f0b45f9371c00 Mon Sep 17 00:00:00 2001 From: rra Date: Wed, 1 Jun 2022 05:51:25 +0200 Subject: [PATCH 4/4] add more checks for failures --- lumbunglib/feed.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/lumbunglib/feed.py b/lumbunglib/feed.py index f5a8d42..f51d3e9 100644 --- a/lumbunglib/feed.py +++ b/lumbunglib/feed.py @@ -152,10 +152,13 @@ def parse_enclosures(post_dir, entry): #TODO handle the fact it could be multiple items for e in entry.enclosures: - print("found enclosed media", e.type) - if "image/" in e.type: - featured_image = grab_media(post_dir, e.href) - entry["featured_image"] = featured_image + if "type" in e: + print("found enclosed media", e.type) + if "image/" in e.type: + featured_image = grab_media(post_dir, e.href) + entry["featured_image"] = featured_image + else: + print("FIXME:ignoring enclosed", e.type) return entry @@ -267,11 +270,12 @@ def grab_feed(feed_url): print(e) return False - print(data.status, feed_url) - if data.status == 200: - # 304 means the feed has not been modified since we last checked - write_etag(feed_name, data) - return data + if "status" in data: + print(data.status, feed_url) + if data.status == 200: + # 304 means the feed has not been modified since we last checked + write_etag(feed_name, data) + return data return False def create_opds_post(post_dir, entry):