-
-
Save cdpath/fcf4c59e933275e5db2758920a9c1fd8 to your computer and use it in GitHub Desktop.
| { | |
| "schema_version": 1, | |
| "name": "WeChat", | |
| "description": "微信公众号全文 RSS", | |
| "source_url": false, | |
| "guid": "dd67102f09869c2228f8ed903a32d063", | |
| "tag_fg_color": "#333333", | |
| "tag_bg_color": "#66ff66", | |
| "icon": "leaf", | |
| "exported_at": "2019-01-12T10:56:41Z", | |
| "agents": [ | |
| { | |
| "type": "Agents::WebsiteAgent", | |
| "name": "0 获取微信公众号文章", | |
| "disabled": false, | |
| "guid": "00a23c266080e989591e35697d91b21e", | |
| "options": { | |
| "expected_update_period_in_days": "2", | |
| "url": [ | |
| "http://weixin.sogou.com/weixin?type=1&s_from=input&query=pongba_mindhacks", | |
| "http://weixin.sogou.com/weixin?type=1&s_from=input&query=ling-lunch", | |
| "http://weixin.sogou.com/weixin?type=1&s_from=input&query=noon-story", | |
| "http://weixin.sogou.com/weixin?type=1&s_from=input&query=mzmojo" | |
| ], | |
| "type": "html", | |
| "mode": "on_change", | |
| "extract": { | |
| "title": { | |
| "css": "#sogou_vr_11002301_box_0 > dl:last>dd>a", | |
| "value": ".//text()" | |
| }, | |
| "url": { | |
| "css": "#sogou_vr_11002301_box_0 > dl:last>dd>a", | |
| "value": "@href" | |
| } | |
| }, | |
| "user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0" | |
| }, | |
| "schedule": "every_12h", | |
| "keep_events_for": 172800, | |
| "propagate_immediately": true | |
| }, | |
| { | |
| "type": "Agents::DataOutputAgent", | |
| "name": "4 输出RSS", | |
| "disabled": false, | |
| "guid": "1ea34bb7e56575e6c007bf2e2f48b990", | |
| "options": { | |
| "secrets": [ | |
| "wechat202" | |
| ], | |
| "expected_receive_period_in_days": 2, | |
| "template": { | |
| "title": "微信公众号", | |
| "description": "微信公众号全文", | |
| "item": { | |
| "title": "{{author}} | {{title}}", | |
| "description": "{{fulltext}}", | |
| "link": "{{url}}" | |
| }, | |
| "icon": "https://res.wx.qq.com/mmbizwap/zh_CN/htmledition/images/icon/common/favicon22c41b.ico" | |
| }, | |
| "rss_content_type": "text/xml" | |
| }, | |
| "propagate_immediately": true | |
| }, | |
| { | |
| "type": "Agents::DeDuplicationAgent", | |
| "name": "1 标题去重", | |
| "disabled": false, | |
| "guid": "6406a562f112d0686bf2ae24afcac902", | |
| "options": { | |
| "property": "{{title}}", | |
| "lookback": "200", | |
| "expected_update_period_in_days": "6" | |
| }, | |
| "keep_events_for": 345600, | |
| "propagate_immediately": true | |
| }, | |
| { | |
| "type": "Agents::TriggerAgent", | |
| "name": "2 过滤广告", | |
| "disabled": false, | |
| "guid": "822097071177a8c9d57eb0aea8b7554f", | |
| "options": { | |
| "expected_receive_period_in_days": "6", | |
| "keep_event": "true", | |
| "rules": [ | |
| { | |
| "type": "!regex", | |
| "value": "(市集)|(广告)|(推广)|(招人)|(限时)|(福利)", | |
| "path": "title" | |
| } | |
| ], | |
| "message": "没有看到广告,放行!" | |
| }, | |
| "keep_events_for": 259200, | |
| "propagate_immediately": true | |
| }, | |
| { | |
| "type": "Agents::WebsiteAgent", | |
| "name": "3 获取文章全文", | |
| "disabled": false, | |
| "guid": "c5b16d43997a5195533911e8d1824711", | |
| "options": { | |
| "expected_update_period_in_days": "2", | |
| "url_from_event": "{{url}}", | |
| "type": "html", | |
| "mode": "merge", | |
| "extract": { | |
| "fulltext": { | |
| "css": "#js_content", | |
| "value": "." | |
| }, | |
| "title_": { | |
| "css": "#activity-name", | |
| "value": "normalize-space(.)" | |
| }, | |
| "author": { | |
| "css": "#profileBt > a", | |
| "value": "normalize-space(.)" | |
| } | |
| }, | |
| "template": { | |
| "fulltext": "{{ fulltext |strip_newlines|replace: \"<br>\",\"\" | regex_replace:'data-src','src'}}" | |
| }, | |
| "user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0" | |
| }, | |
| "schedule": "every_12h", | |
| "keep_events_for": 604800, | |
| "propagate_immediately": true | |
| } | |
| ], | |
| "links": [ | |
| { | |
| "source": 0, | |
| "receiver": 2 | |
| }, | |
| { | |
| "source": 2, | |
| "receiver": 3 | |
| }, | |
| { | |
| "source": 3, | |
| "receiver": 4 | |
| }, | |
| { | |
| "source": 4, | |
| "receiver": 1 | |
| } | |
| ], | |
| "control_links": [ | |
| ] | |
| } |
有两个问题想请教一下:按照您这个 json,替换掉里面的两类微信号后,在 inoreader 里面标题显示有点问题,像这样子:
<h2 class="rich_media_title" id="activity-name"> 海南再无“世界岛” </h2>
另外就是现在这个json好像不能抓到图片?
- 标题在 inoreader 网页版和 Reeder3 上没有发现问题。
- 微信的图片链接会超时,尽量让 RSS 阅读器及时缓存到本地。
如果发现 RSS 停止更新,
- 移除失效的公众号搜索链接,比如 http://weixin.sogou.com/weixin?type=1&s_from=input&query=XXX_XXX
- 更换 IP,如果部署在 Heroku 直接重启,可参考 Why is my app's IP address being blocked by third parties? - Heroku Help
Why did I fail when importing it to Scenarios?
“1 error prohibited this Scenario from being imported:
The provided data does not appear to be a valid Scenario.”
@zacharycode fixed
楼主您好,测试过程中发现,假如微信文章是一篇图文,或者是一篇转载之类的文章(总结起来就是 activity-name 元素找不到的时候),会报如下错误:
Error when fetching url: Got an uneven number of matches for : {"fulltext"=>{"css"=>"#js_content", "value"=>"."}, "title"=>{"css"=>"#activity-name", "value"=>"normalize-space(.)"}}
知识不熟,找到问题但是没办法解决,楼主看看能不能帮忙解决一下,谢谢。
@cdpath 楼主您好,测试过程中发现,假如微信文章是一篇图文,或者是一篇转载之类的文章(总结起来就是 activity-name 元素找不到的时候),会报如下错误:
Error when fetching url: Got an uneven number of matches for : {"fulltext"=>{"css"=>"#js_content", "value"=>"."}, "title"=>{"css"=>"#activity-name", "value"=>"normalize-space(.)"}}
知识不熟,找到问题但是没办法解决,楼主看看能不能帮忙解决一下,谢谢。
@longhaiqwe 出错的微信文章的链接发一下
由于最新的反爬虫机制,该 Scenario 已不可用,我写了个新的: https://gist.github.com/dequn/674b0401c1f31f7919b112ad64640552 。
楼主,这个现在还能使用吗? @cdpath
量比较大(几十个号)的情况下好像会命中搜狗的反爬虫,获取那一步数据就是空的了