From 848b96a03c8b19668bf2cd78a269bd46256e240c Mon Sep 17 00:00:00 2001
From: Alexandre Dulaunoy
Date: Sun, 3 Mar 2024 21:49:55 +0100
Subject: [PATCH 1/3] new: [rssfind.py] a simple script to discover RSS/Atom feeds from an URL

---
 README.md      | 15 +++++++++
 REQUIREMENTS   |  2 ++
 bin/rssfind.py | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 106 insertions(+)
 create mode 100644 bin/rssfind.py

diff --git a/README.md b/README.md
index 89c51d4..f0ffe3d 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,21 @@ As 2024 marks the resurgence of RSS and Atom[^1], I decided to update my rudimen
 
 ## Tools
 
+### rssfind
+
+[rssfind.py](https://github.com/adulau/rss-tools/blob/master/bin/rssfind.py) is a simple script for discovering RSS or Atom feeds from a URL. It returns an array in JSON format of all the potential feeds discovered.
+
+~~~shell
+Usage: Find RSS or Atom feeds from a URL
+usage: rssfind.py [options]
+
+Options:
+  -h, --help            show this help message and exit
+  -l LINK, --link=LINK  http link where to find one or more feed source(s)
+  -d, --disable-strict  Include empty feeds in the list, default strict is
+                        enabled
+~~~
+
 ### rsscluster
 
 [rsscluster.py](https://github.com/adulau/rss-tools/blob/master/bin/rsscluster.py) is a simple script that clusters items from an RSS feed based on a specified time interval, expressed in days.
diff --git a/REQUIREMENTS b/REQUIREMENTS
index f57478b..7aaddcd 100644
--- a/REQUIREMENTS
+++ b/REQUIREMENTS
@@ -1,2 +1,4 @@
 bs4
 feedparser
+orjson
+requests
diff --git a/bin/rssfind.py b/bin/rssfind.py
new file mode 100644
index 0000000..c25b6f6
--- /dev/null
+++ b/bin/rssfind.py
@@ -0,0 +1,89 @@
+#!/usr/bin/python3
+
+import sys
+import urllib.parse
+from optparse import OptionParser
+
+import feedparser
+import orjson as json
+import requests
+from bs4 import BeautifulSoup as bs4
+
+
+def findfeeds(url=None, disable_strict=False):
+    if url is None:
+        return None
+
+    raw = requests.get(url).text
+    results = []
+    discovered_feeds = []
+    html = bs4(raw, features="lxml")
+    feed_urls = html.findAll("link", rel="alternate")
+    if feed_urls:
+        for f in feed_urls:
+            tag = f.get("type", None)
+            if tag:
+                if "feed" in tag or "rss" in tag or "xml" in tag:
+                    href = f.get("href", None)
+                    if href:
+                        discovered_feeds.append(href)
+
+    parsed_url = urllib.parse.urlparse(url)
+    base = f"{parsed_url.scheme}://{parsed_url.hostname}"
+    ahreftags = html.findAll("a")
+
+    for a in ahreftags:
+        href = a.get("href", None)
+        if href:
+            if "feed" in href or "rss" in href or "xml" in href:
+                discovered_feeds.append(f"{base}{href}")
+
+    for url in list(set(discovered_feeds)):
+        f = feedparser.parse(url)
+        if f.entries:
+            if url not in results:
+                results.append(url)
+
+    if disable_strict:
+        return list(set(discovered_feeds))
+    else:
+        return results
+
+
+version = "0.2"
+
+feedparser.USER_AGENT = (
+    "rssfind.py " + version + " +https://github.com/adulau/rss-tools"
+)
+
+usage = "Find RSS or Atom feeds from a URL\nusage: %prog [options]"
+
+parser = OptionParser(usage)
+
+parser.add_option(
+    "-l",
+    "--link",
+    dest="link",
+    help="http link where to find one or more feed source(s)",
+)
+
+parser.add_option(
+    "-d",
+    "--disable-strict",
+    action="store_true",
+    default=False,
+    help="Include empty feeds in the list, default strict is enabled",
+)
+
+(options, args) = parser.parse_args()
+
+if not options.link:
+    print("URL missing")
+    parser.print_help()
+    sys.exit(0)
+
+print(
+    json.dumps(findfeeds(options.link, disable_strict=options.disable_strict)).decode(
"utf-8" + ) +) From 149c6b4489d34bcfb53bea8e872066386ef823ee Mon Sep 17 00:00:00 2001 From: Alexandre Dulaunoy Date: Sun, 3 Mar 2024 22:07:18 +0100 Subject: [PATCH 2/3] chg: [rssfind] set coherent `User-Agent` headers --- bin/rssfind.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/bin/rssfind.py b/bin/rssfind.py index c25b6f6..ce03b0b 100644 --- a/bin/rssfind.py +++ b/bin/rssfind.py @@ -14,7 +14,7 @@ def findfeeds(url=None, disable_strict=False): if url is None: return None - raw = requests.get(url).text + raw = requests.get(url, headers=headers).text results = [] discovered_feeds = [] html = bs4(raw, features="lxml") @@ -52,9 +52,12 @@ def findfeeds(url=None, disable_strict=False): version = "0.2" -feedparser.USER_AGENT = ( - "rssfind.py " + version + " +https://github.com/adulau/rss-tools" -) +user_agent = f"rssfind.py {version} +https://github.com/adulau/rss-tools" + +feedparser.USER_AGENT = user_agent + + +headers = {"User-Agent": user_agent} usage = "Find RSS or Atom feeds from an URL\nusage: %prog [options]" From 4f263946929f43c36ad62af966a523a19e78bdc4 Mon Sep 17 00:00:00 2001 From: Alexandre Dulaunoy Date: Mon, 4 Mar 2024 11:14:28 +0100 Subject: [PATCH 3/3] chg: [rssfind] added a brute-force mode `-b` to discover potential feed source --- README.md | 11 +++++++- bin/rssfind.py | 75 ++++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 79 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index f0ffe3d..f655a81 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,14 @@ As 2024 marks the resurgence of RSS and Atom[^1], I decided to update my rudimen ### rssfind -[rssfind.py](https://github.com/adulau/rss-tools/blob/master/bin/rsscluster.py) is a simple script for discovering RSS or Atom feeds from a URL. It returns an array in JSON format of all the potential feeds discovered. +[rssfind.py](https://github.com/adulau/rss-tools/blob/master/bin/rsscluster.py) is a simple script designed to discover RSS or Atom feeds from a given URL. + +It employs two techniques: + +- The first involves searching for direct link references to the feed within the HTML page. +- The second uses a brute-force approach, trying a series of known paths for feeds to determine if they are valid RSS or Atom feeds. + +The script returns an array in JSON format containing all the potential feeds it discovers. ~~~shell Usage: Find RSS or Atom feeds from an URL @@ -28,6 +35,8 @@ Options: -l LINK, --link=LINK http link where to find one or more feed source(s) -d, --disable-strict Include empty feeds in the list, default strict is enabled + -b, --brute-force Search RSS/Atom feeds by brute-forcing url path + (useful if the page is missing a link entry) ~~~ ### rsscluster diff --git a/bin/rssfind.py b/bin/rssfind.py index ce03b0b..8d528f1 100644 --- a/bin/rssfind.py +++ b/bin/rssfind.py @@ -1,14 +1,41 @@ #!/usr/bin/python3 +# [rssfind.py](https://github.com/adulau/rss-tools/blob/master/bin/rsscluster.py) is a simple script designed to discover RSS or Atom feeds from a given URL. +# +# It employs two techniques: +# +# - The first involves searching for direct link references to the feed within the HTML page. +# - The second uses a brute-force approach, trying a series of known paths for feeds to determine if they are valid RSS or Atom feeds. +# +# The script returns an array in JSON format containing all the potential feeds it discovers. 
 import sys
 import urllib.parse
 from optparse import OptionParser
+import random
 
 import feedparser
 import orjson as json
 import requests
 from bs4 import BeautifulSoup as bs4
 
+brute_force_urls = [
+    "index.xml",
+    "feed/index.php",
+    "feed.xml",
+    "feed.atom",
+    "feed.rss",
+    "feed.json",
+    "feed.php",
+    "feed.asp",
+    "posts.rss",
+    "blog.xml",
+    "atom.xml",
+    "podcasts.xml",
+    "main.atom",
+    "main.xml",
+]
+random.shuffle(brute_force_urls)
+
 
 def findfeeds(url=None, disable_strict=False):
     if url is None:
@@ -50,13 +77,34 @@ def findfeeds(url=None, disable_strict=False):
         return results
 
 
+def brutefindfeeds(url=None, disable_strict=False):
+    if url is None:
+        return None
+    found_urls = []
+    found_valid_feeds = []
+    parsed_url = urllib.parse.urlparse(url)
+    for path in brute_force_urls:
+        url = f"{parsed_url.scheme}://{parsed_url.hostname}/{path}"
+        r = requests.get(url, headers=headers)
+        if r.status_code == 200:
+            found_urls.append(url)
+    for url in list(set(found_urls)):
+        f = feedparser.parse(url)
+        if f.entries:
+            if url not in found_valid_feeds:
+                found_valid_feeds.append(url)
+    if disable_strict:
+        return list(set(found_urls))
+    else:
+        return found_valid_feeds
+
+
 version = "0.2"
 
 user_agent = f"rssfind.py {version} +https://github.com/adulau/rss-tools"
 
 feedparser.USER_AGENT = user_agent
 
-
 headers = {"User-Agent": user_agent}
 
 usage = "Find RSS or Atom feeds from a URL\nusage: %prog [options]"
@@ -78,15 +126,30 @@ def findfeeds(url=None, disable_strict=False):
     help="Include empty feeds in the list, default strict is enabled",
 )
 
+parser.add_option(
+    "-b",
+    "--brute-force",
+    action="store_true",
+    default=False,
+    help="Search RSS/Atom feeds by brute-forcing url path (useful if the page is missing a link entry)",
+)
+
 (options, args) = parser.parse_args()
 
 if not options.link:
-    print("URL missing")
+    print("Link/URL missing, use the -l option")
     parser.print_help()
     sys.exit(0)
 
-print(
-    json.dumps(findfeeds(options.link, disable_strict=options.disable_strict)).decode(
-        "utf-8"
+if not options.brute_force:
+    print(
+        json.dumps(
+            findfeeds(url=options.link, disable_strict=options.disable_strict)
+        ).decode("utf-8")
+    )
+else:
+    print(
+        json.dumps(
+            brutefindfeeds(url=options.link, disable_strict=options.disable_strict)
+        ).decode("utf-8")
     )
-)
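
For reference, below is a minimal, self-contained sketch of the link-based discovery plus feedparser validation that `findfeeds()` performs: collect candidate feed URLs from `<link rel="alternate">` tags, then keep only the candidates feedparser can parse as non-empty feeds. The helper name and the example URL are illustrative assumptions, not part of the patches; the actual script additionally scans `<a href>` tags and, with `-b`, probes the list of well-known feed paths added in the last patch.

~~~python
# Illustrative sketch only; not the patched bin/rssfind.py.
# Requires requests, bs4 and feedparser (already listed in REQUIREMENTS).
from urllib.parse import urljoin

import feedparser
import requests
from bs4 import BeautifulSoup


def discover_feeds(url, strict=True):
    # Collect candidate feed URLs advertised via <link rel="alternate"> tags.
    page = requests.get(url, timeout=10)
    soup = BeautifulSoup(page.text, "html.parser")
    candidates = []
    for link in soup.find_all("link", rel="alternate"):
        mime = link.get("type") or ""
        href = link.get("href")
        if href and ("rss" in mime or "atom" in mime or "xml" in mime):
            # Resolve relative hrefs against the page URL.
            candidates.append(urljoin(url, href))
    if not strict:
        # Roughly what the script's -d/--disable-strict mode returns.
        return candidates
    # Strict mode: keep only candidates feedparser recognises as non-empty feeds.
    return [c for c in candidates if feedparser.parse(c).entries]


if __name__ == "__main__":
    # Placeholder URL; replace with the page to inspect.
    print(discover_feeds("https://example.org/"))
~~~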