From 848b96a03c8b19668bf2cd78a269bd46256e240c Mon Sep 17 00:00:00 2001
From: Alexandre Dulaunoy
Date: Sun, 3 Mar 2024 21:49:55 +0100
Subject: [PATCH 1/3] new: [rssfind.py] a simple script to discover RSS/Atom
feeds from a URL
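
A quick usage sketch, with a hypothetical URL and illustrative output (the
actual feeds returned depend on the target site):

    $ python3 bin/rssfind.py -l https://www.example.com/blog
    ["https://www.example.com/blog/feed.xml"]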
---
README.md | 15 +++++++++
REQUIREMENTS | 3 +++
bin/rssfind.py | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 107 insertions(+)
create mode 100644 bin/rssfind.py
diff --git a/README.md b/README.md
index 89c51d4..f0ffe3d 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,21 @@ As 2024 marks the resurgence of RSS and Atom[^1], I decided to update my rudimen
## Tools
+### rssfind
+
+[rssfind.py](https://github.com/adulau/rss-tools/blob/master/bin/rssfind.py) is a simple script for discovering RSS or Atom feeds from a URL. It returns a JSON array of all the potential feeds discovered.
+
+~~~shell
+Usage: Find RSS or Atom feeds from an URL
+usage: rssfind.py [options]
+
+Options:
+ -h, --help show this help message and exit
+ -l LINK, --link=LINK http link where to find one or more feed source(s)
+ -d, --disable-strict Include empty feeds in the list, default strict is
+ enabled
+~~~
+
### rsscluster
[rsscluster.py](https://github.com/adulau/rss-tools/blob/master/bin/rsscluster.py) is a simple script that clusters items from an RSS feed based on a specified time interval, expressed in days.
diff --git a/REQUIREMENTS b/REQUIREMENTS
index f57478b..7aaddcd 100644
--- a/REQUIREMENTS
+++ b/REQUIREMENTS
@@ -1,2 +1,5 @@
bs4
feedparser
+lxml
+orjson
+requests
diff --git a/bin/rssfind.py b/bin/rssfind.py
new file mode 100644
index 0000000..c25b6f6
--- /dev/null
+++ b/bin/rssfind.py
@@ -0,0 +1,89 @@
+#!/usr/bin/python3
+
+import sys
+import urllib.parse
+from optparse import OptionParser
+
+import feedparser
+import orjson as json
+import requests
+from bs4 import BeautifulSoup as bs4
+
+
+def findfeeds(url=None, disable_strict=False):
+ if url is None:
+ return None
+
+ raw = requests.get(url).text
+ results = []
+ discovered_feeds = []
+ html = bs4(raw, features="lxml")
+ feed_urls = html.findAll("link", rel="alternate")
+ if feed_urls:
+ for f in feed_urls:
+ tag = f.get("type", None)
+ if tag:
+ if "feed" in tag or "rss" in tag or "xml" in tag:
+ href = f.get("href", None)
+ if href:
+ discovered_feeds.append(href)
+
+ parsed_url = urllib.parse.urlparse(url)
+ base = f"{parsed_url.scheme}://{parsed_url.hostname}"
+ ahreftags = html.findAll("a")
+
+ for a in ahreftags:
+ href = a.get("href", None)
+ if href:
+ if "feed" in href or "rss" in href or "xml" in href:
+ discovered_feeds.append(f"{base}{href}")
+
+ for url in list(set(discovered_feeds)):
+ f = feedparser.parse(url)
+ if f.entries:
+ if url not in results:
+ results.append(url)
+
+ if disable_strict:
+ return list(set(discovered_feeds))
+ else:
+ return results
+
+
+version = "0.2"
+
+feedparser.USER_AGENT = (
+ "rssfind.py " + version + " +https://github.com/adulau/rss-tools"
+)
+
+usage = "Find RSS or Atom feeds from an URL\nusage: %prog [options]"
+
+parser = OptionParser(usage)
+
+parser.add_option(
+ "-l",
+ "--link",
+ dest="link",
+ help="http link where to find one or more feed source(s)",
+)
+
+parser.add_option(
+ "-d",
+ "--disable-strict",
+ action="store_false",
+ default=False,
+ help="Include empty feeds in the list, default strict is enabled",
+)
+
+(options, args) = parser.parse_args()
+
+if not options.link:
+ print("URL missing")
+ parser.print_help()
+ sys.exit(0)
+
+print(
+ json.dumps(findfeeds(options.link, disable_strict=options.disable_strict)).decode(
+ "utf-8"
+ )
+)
From 149c6b4489d34bcfb53bea8e872066386ef823ee Mon Sep 17 00:00:00 2001
From: Alexandre Dulaunoy
Date: Sun, 3 Mar 2024 22:07:18 +0100
Subject: [PATCH 2/3] chg: [rssfind] set coherent `User-Agent` headers
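
With this change, the page fetch done with requests and the feed probes done
with feedparser both send the same User-Agent string, derived from the script
version, e.g.:

    User-Agent: rssfind.py 0.2 +https://github.com/adulau/rss-tools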
---
bin/rssfind.py | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/bin/rssfind.py b/bin/rssfind.py
index c25b6f6..ce03b0b 100644
--- a/bin/rssfind.py
+++ b/bin/rssfind.py
@@ -14,7 +14,7 @@ def findfeeds(url=None, disable_strict=False):
if url is None:
return None
- raw = requests.get(url).text
+ raw = requests.get(url, headers=headers).text
results = []
discovered_feeds = []
html = bs4(raw, features="lxml")
@@ -52,9 +52,12 @@ def findfeeds(url=None, disable_strict=False):
version = "0.2"
-feedparser.USER_AGENT = (
- "rssfind.py " + version + " +https://github.com/adulau/rss-tools"
-)
+user_agent = f"rssfind.py {version} +https://github.com/adulau/rss-tools"
+
+feedparser.USER_AGENT = user_agent
+
+
+headers = {"User-Agent": user_agent}
usage = "Find RSS or Atom feeds from an URL\nusage: %prog [options]"
From 4f263946929f43c36ad62af966a523a19e78bdc4 Mon Sep 17 00:00:00 2001
From: Alexandre Dulaunoy
Date: Mon, 4 Mar 2024 11:14:28 +0100
Subject: [PATCH 3/3] chg: [rssfind] added a brute-force mode `-b` to discover
potential feed sources
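
A usage sketch for the new mode, again with a hypothetical URL and
illustrative output (which of the well-known paths respond depends entirely
on the target site):

    $ python3 bin/rssfind.py -b -l https://www.example.com
    ["https://www.example.com/index.xml","https://www.example.com/atom.xml"]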
---
README.md | 11 +++++++-
bin/rssfind.py | 75 ++++++++++++++++++++++++++++++++++++++++++++++----
2 files changed, 79 insertions(+), 7 deletions(-)
diff --git a/README.md b/README.md
index f0ffe3d..f655a81 100644
--- a/README.md
+++ b/README.md
@@ -17,7 +17,14 @@ As 2024 marks the resurgence of RSS and Atom[^1], I decided to update my rudimen
### rssfind
-[rssfind.py](https://github.com/adulau/rss-tools/blob/master/bin/rssfind.py) is a simple script for discovering RSS or Atom feeds from a URL. It returns a JSON array of all the potential feeds discovered.
+[rssfind.py](https://github.com/adulau/rss-tools/blob/master/bin/rssfind.py) is a simple script designed to discover RSS or Atom feeds from a given URL.
+
+It employs two techniques:
+
+- The first involves searching for direct link references to the feed within the HTML page.
+- The second uses a brute-force approach, trying a series of known paths for feeds to determine if they are valid RSS or Atom feeds.
+
+The script returns a JSON array containing all the potential feeds it discovers.
~~~shell
Usage: Find RSS or Atom feeds from an URL
@@ -28,6 +35,8 @@ Options:
-l LINK, --link=LINK http link where to find one or more feed source(s)
-d, --disable-strict Include empty feeds in the list, default strict is
enabled
+ -b, --brute-force Search RSS/Atom feeds by brute-forcing url path
+ (useful if the page is missing a link entry)
~~~
### rsscluster
diff --git a/bin/rssfind.py b/bin/rssfind.py
index ce03b0b..8d528f1 100644
--- a/bin/rssfind.py
+++ b/bin/rssfind.py
@@ -1,14 +1,41 @@
#!/usr/bin/python3
+# [rssfind.py](https://github.com/adulau/rss-tools/blob/master/bin/rssfind.py) is a simple script designed to discover RSS or Atom feeds from a given URL.
+#
+# It employs two techniques:
+#
+# - The first involves searching for direct link references to the feed within the HTML page.
+# - The second uses a brute-force approach, trying a series of known paths for feeds to determine if they are valid RSS or Atom feeds.
+#
+# The script returns a JSON array containing all the potential feeds it discovers.
import sys
import urllib.parse
from optparse import OptionParser
+import random
import feedparser
import orjson as json
import requests
from bs4 import BeautifulSoup as bs4
+brute_force_urls = [
+ "index.xml",
+ "feed/index.php",
+ "feed.xml",
+ "feed.atom",
+ "feed.rss",
+ "feed.json",
+ "feed.php",
+ "feed.asp",
+ "posts.rss",
+ "blog.xml",
+ "atom.xml",
+ "podcasts.xml",
+ "main.atom",
+ "main.xml",
+]
+random.shuffle(brute_force_urls)
+
def findfeeds(url=None, disable_strict=False):
if url is None:
@@ -50,13 +77,34 @@ def findfeeds(url=None, disable_strict=False):
return results
+def brutefindfeeds(url=None, disable_strict=False):
+ if url is None:
+ return None
+ found_urls = []
+ found_valid_feeds = []
+ parsed_url = urllib.parse.urlparse(url)
+ for path in brute_force_urls:
+ url = f"{parsed_url.scheme}://{parsed_url.hostname}/{path}"
+ r = requests.get(url, headers=headers)
+ if r.status_code == 200:
+ found_urls.append(url)
+ for url in list(set(found_urls)):
+ f = feedparser.parse(url)
+ if f.entries:
+ if url not in found_valid_feeds:
+ found_valid_feeds.append(url)
+ if disable_strict:
+ return list(set(found_urls))
+ else:
+ return found_valid_feeds
+
+
version = "0.2"
user_agent = f"rssfind.py {version} +https://github.com/adulau/rss-tools"
feedparser.USER_AGENT = user_agent
-
headers = {"User-Agent": user_agent}
usage = "Find RSS or Atom feeds from an URL\nusage: %prog [options]"
@@ -78,15 +126,30 @@ def findfeeds(url=None, disable_strict=False):
help="Include empty feeds in the list, default strict is enabled",
)
+parser.add_option(
+ "-b",
+ "--brute-force",
+ action="store_true",
+ default=False,
+ help="Search RSS/Atom feeds by brute-forcing url path (useful if the page is missing a link entry)",
+)
+
(options, args) = parser.parse_args()
if not options.link:
- print("URL missing")
+ print("Link/url missing - -l option")
parser.print_help()
sys.exit(0)
-print(
- json.dumps(findfeeds(options.link, disable_strict=options.disable_strict)).decode(
- "utf-8"
+if not options.brute_force:
+ print(
+ json.dumps(
+ findfeeds(url=options.link, disable_strict=options.disable_strict)
+ ).decode("utf-8")
+ )
+else:
+ print(
+ json.dumps(
+ brutefindfeeds(url=options.link, disable_strict=options.disable_strict)
+ ).decode("utf-8")
)
-)