Created
October 11, 2020 15:20
-
-
Save zmwangx/ce643da063bc6b259e83a46dfd719946 to your computer and use it in GitHub Desktop.
googler "Top stories" support https://github.com/jarun/googler/issues/361
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
From 06e70e23f8086bb68d98893bbaeaa9df3639ef89 Mon Sep 17 00:00:00 2001 | |
From: Zhiming Wang <[email protected]> | |
Date: Sun, 11 Oct 2020 23:19:12 +0800 | |
Subject: [PATCH] Add experimental support for "Top stories" | |
--- | |
googler | 29 +++++++++++++++++++++++++++++ | |
1 file changed, 29 insertions(+) | |
diff --git a/googler b/googler | |
index b1479a7..e2865d2 100755 | |
--- a/googler | |
+++ b/googler | |
@@ -2343,6 +2343,35 @@ class GoogleParser(object): | |
cw = lambda s: re.sub(r'[ \t\n\r]+', ' ', s) if s is not None else s | |
index = 0 | |
+ | |
+ # Try to parse "Top stories". | |
+ # | |
+ # Detection doesn't work on all pages! E.g. when I search
+ # "covid", the layout for "Top stories" is simply different.
+ carousel = tree.select('g-section-with-header g-scrolling-carousel') | |
+ if carousel: | |
+ # Devise a really crappy strategy to tell a "Top stories" | |
+ # carousel apart from a Twitter carousel, which unfortunately | |
+ # shares the same structure. | |
+ section = next(el for el in carousel.ancestors() if el.tag == 'g-section-with-header') | |
+ if section.first_element_child().select('title-with-lhs-icon'): | |
+ # This section contains a title-with-lhs-icon (":newspaper
+ # icon: Top stories"), which a Twitter carousel doesn't have;
+ # that is a good sign.
+ for card in carousel.select_all('g-inner-card'): | |
+ heading = card.select('[role=heading]') | |
+ title = heading.text | |
+ a = card.select('a') | |
+ url = a.attr('href') | |
+ metadata_node = heading.parent.last_element_child() | |
+ metadata = metadata_node.text if metadata_node is not heading else '' | |
+ result = Result(index + 1, cw(title), url, '', | |
+ metadata=cw(metadata), sitelinks=[], matches=[]) | |
+ if result not in self.results: | |
+ self.results.append(result) | |
+ index += 1 | |
+ | |
+ # Regular results. | |
for div_g in tree.select_all('div.g'): | |
if div_g.select('.hp-xpdbox'): | |
# Skip smart cards. | |
-- | |
2.28.0 | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment