Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save zmwangx/ce643da063bc6b259e83a46dfd719946 to your computer and use it in GitHub Desktop.
Save zmwangx/ce643da063bc6b259e83a46dfd719946 to your computer and use it in GitHub Desktop.
googler "Top stories" support https://github.com/jarun/googler/issues/361
From 06e70e23f8086bb68d98893bbaeaa9df3639ef89 Mon Sep 17 00:00:00 2001
From: Zhiming Wang <[email protected]>
Date: Sun, 11 Oct 2020 23:19:12 +0800
Subject: [PATCH] Add experimental support for "Top stories"
---
googler | 29 +++++++++++++++++++++++++++++
1 file changed, 29 insertions(+)
diff --git a/googler b/googler
index b1479a7..e2865d2 100755
--- a/googler
+++ b/googler
@@ -2343,6 +2343,35 @@ class GoogleParser(object):
cw = lambda s: re.sub(r'[ \t\n\r]+', ' ', s) if s is not None else s
index = 0
+
+ # Try to parse "Top stories".
+ #
+ # Detection doesn't work on all pages! E.g., when searching for
+ # "covid", the "Top stories" layout is different and is not detected.
+ carousel = tree.select('g-section-with-header g-scrolling-carousel')
+ if carousel:
+ # Use a crude heuristic to tell a "Top stories" carousel
+ # apart from a Twitter carousel, which unfortunately
+ # shares the same structure.
+ section = next(el for el in carousel.ancestors() if el.tag == 'g-section-with-header')
+ if section.first_element_child().select('title-with-lhs-icon'):
+ # This section contains a title-with-lhs-icon (":newspaper
+ # icon: Top stories") which a Twitter carousel doesn't have,
+ # a good sign...
+ for card in carousel.select_all('g-inner-card'):
+ heading = card.select('[role=heading]')
+ title = heading.text
+ a = card.select('a')
+ url = a.attr('href')
+ metadata_node = heading.parent.last_element_child()
+ metadata = metadata_node.text if metadata_node is not heading else ''
+ result = Result(index + 1, cw(title), url, '',
+ metadata=cw(metadata), sitelinks=[], matches=[])
+ if result not in self.results:
+ self.results.append(result)
+ index += 1
+
+ # Regular results.
for div_g in tree.select_all('div.g'):
if div_g.select('.hp-xpdbox'):
# Skip smart cards.
--
2.28.0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment