Skip to content

Commit

Permalink
Crawler: improve the RSS-Bridge Twitter enhancement
Browse files (browse the repository at this point in the history)
  • Loading branch information
jaesivsm committed Nov 13, 2020
1 parent 92f8b69 commit e7e3ae6
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 0 deletions.
3 changes: 3 additions & 0 deletions jarr/crawler/article_builders/rss_bridge.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,14 @@ def enhance(self):
return
soup = BeautifulSoup(content, 'html.parser')
og_link = self.article['link']
og_comments = self.article.get('comments')
try: # trying to find the last link in the tweet
last_link = soup.find_all('a')[-1]
self.article['comments'] = self.article['link']
self.article['link'] = last_link.attrs['href']
except (KeyError, AttributeError, TypeError, IndexError):
self.article['link'] = og_link
self.article['comments'] = og_comments
else:
try: # link is the image if the link contains the images
img = last_link.find_all('img')[0]
Expand Down
2 changes: 2 additions & 0 deletions tests/libs/rss_bridg_inte_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def test_rss_twitter_bridge_link_handling(self):
builder.enhance()
self.assertEqual("https://www.enercoop.fr/content/licoornes-les-cooper"
"atives-du-monde-dapres", builder.article['link'])
self.assertEqual(entry['link'], builder.article['comments'])
self.assertIsNone(builder.article.get('article_type'))

def test_rss_twitter_bridge_img_handling(self):
Expand All @@ -45,4 +46,5 @@ def test_rss_twitter_bridge_img_handling(self):
builder.enhance()
self.assertEqual("https://pbs.twimg.com/media/EmZUUQxXcAAxMTp.jpg",
builder.article['link'])
self.assertEqual(entry['link'], builder.article['comments'])
self.assertEqual('image', builder.article['article_type'].value)

0 comments on commit e7e3ae6

Please sign in to comment.