Created
August 3, 2019 10:16
-
-
Save dequn/78f100cdd78e72aced9462be06892566 to your computer and use it in GitHub Desktop.
Huginn抓取微信公众号文章,只能抓取最近一篇,20190803更新
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"schema_version": 1, | |
"name": "微信公众号订阅【第二版只有最新的一篇文章】", | |
"description": "2019.08.03 更新,最新可用", | |
"source_url": false, | |
"guid": "a0be876a712baf52fa4270d6b174443a", | |
"tag_fg_color": "#ffffff", | |
"tag_bg_color": "#00b050", | |
"icon": "eye", | |
"exported_at": "2019-08-03T10:05:30Z", | |
"agents": [ | |
{ | |
"type": "Agents::WebsiteAgent", | |
"name": "微信公众号【第二版】 #4 Fetch 跳转URL", | |
"disabled": false, | |
"guid": "07771b5a19dc50901cdece327f56a0ba", | |
"options": { | |
"expected_update_period_in_days": "2", | |
"url_from_event": "https://weixin.sogou.com{{url}}", | |
"type": "text", | |
"user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36", | |
"headers": { | |
"Referer": "{{refer}}", | |
"Cookie": "{{cookie}}" | |
}, | |
"mode": "on_change", | |
"extract": { | |
"content": { | |
"index": "0", | |
"regexp": "\\A(?m:.)*\\z" | |
} | |
} | |
}, | |
"schedule": "never", | |
"keep_events_for": 0, | |
"propagate_immediately": false | |
}, | |
{ | |
"type": "Agents::JavaScriptAgent", | |
"name": "微信公众号【第二版】 #5 解析跳转URL", | |
"disabled": false, | |
"guid": "12a0dbce60dde12672ca32132bfaedca", | |
"options": { | |
"language": "JavaScript", | |
"code": "Agent.receive = function() {\r\n var events = this.incomingEvents();\r\n for(var i = 0; i < events.length; i++) {\r\n var s = events[i].payload['content'];\r\n var reg = /\\+\\= '(.*)';/g;\r\n var url = '';\r\n while (temp = reg.exec(s)) {\r\n url += temp[1];\r\n }\r\n this.createEvent({ 'url': url });\r\n }\r\n}", | |
"expected_receive_period_in_days": "2", | |
"expected_update_period_in_days": "2" | |
}, | |
"schedule": "never", | |
"keep_events_for": 3600, | |
"propagate_immediately": false | |
}, | |
{ | |
"type": "Agents::DeDuplicationAgent", | |
"name": "微信公众号【第二版】 #2 去除重复", | |
"disabled": false, | |
"guid": "3250efcc923ce4c10b30763c6e0365ba", | |
"options": { | |
"property": "{{title}}", | |
"lookback": "200", | |
"expected_update_period_in_days": "20" | |
}, | |
"keep_events_for": 15552000, | |
"propagate_immediately": true | |
}, | |
{ | |
"type": "Agents::WebsiteAgent", | |
"name": "微信公众号【第二版】 #1 搜索公众号", | |
"disabled": false, | |
"guid": "3ea551c97594a56ac09641eea130820a", | |
"options": { | |
"expected_update_period_in_days": "4", | |
"_comment": [ | |
"中国国家地理", | |
"柴知道" | |
], | |
"url": [ | |
"https://weixin.sogou.com/weixin?type=1&query=dili360&ie=utf8&s_from=input&_sug_=y&_sug_type_=", | |
"https://weixin.sogou.com/weixin?type=1&query=chaiknows&ie=utf8&s_from=input&_sug_=y&_sug_type_=" | |
], | |
"type": "html", | |
"mode": "on_change", | |
"template": { | |
"cookie": "{{ _response_.headers.Set-Cookie }}", | |
"refer": "{{ _url_ }}" | |
}, | |
"extract": { | |
"title": { | |
"css": "#sogou_vr_11002301_box_0 > dl > dd > a", | |
"value": "string(.)" | |
}, | |
"url": { | |
"css": "#sogou_vr_11002301_box_0 > dl > dd > a", | |
"value": "@href" | |
} | |
}, | |
"user_agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36" | |
}, | |
"schedule": "2am", | |
"keep_events_for": 172800, | |
"propagate_immediately": false | |
}, | |
{ | |
"type": "Agents::DataOutputAgent", | |
"name": "微信公众号【第二版】 #7 输出RSS", | |
"disabled": false, | |
"guid": "cbc2ced20c71d435a5bd375b3e58079a", | |
"options": { | |
"secrets": [ | |
"weixin_public" | |
], | |
"expected_receive_period_in_days": "20", | |
"template": { | |
"title": "微信公众号【第二版】", | |
"description": "微信公众号【第二版】", | |
"item": { | |
"title": "【{{author}}】{{title}}", | |
"description": "{{ desciption | replace: 'data-src', 'src' }}", | |
"link": "{{url}}" | |
}, | |
"link": "https://weixin.sogou.com/", | |
"icon": "https://res.wx.qq.com/mmbizwap/zh_CN/htmledition/images/icon/common/favicon22c41b.ico" | |
}, | |
"ns_media": "true", | |
"events_to_show": "300" | |
}, | |
"propagate_immediately": true | |
}, | |
{ | |
"type": "Agents::JavaScriptAgent", | |
"name": "微信公众号【第二版】 #3 按搜狗前端代码重新构造URL", | |
"disabled": false, | |
"guid": "e90d2135c2cf93c1dbc2b6e8b78bfeea", | |
"options": { | |
"language": "JavaScript", | |
"code": "Agent.receive = function() {\r\n var events = this.incomingEvents();\r\n for(var i = 0; i < events.length; i++) {\r\n // concat url\r\n var url = events[i].payload['url'];\r\n var b = Math.floor(100 * Math.random()) + 1\r\n , a = url.indexOf(\"url=\")\r\n , c = url.indexOf(\"&k=\");\r\n -1 !== a && -1 === c && (a = url.substr(a + 4 + parseInt(\"21\") + b, 1),\r\n url += \"&k=\" + b + \"&h=\" + a);\r\n events[i].payload['url'] = url;\r\n \r\n // extract SNUID cookie\r\n var re = /SNUID=\\w*/;\r\n var cookie = re.exec(events[i].payload['cookie'])[0];\r\n events[i].payload['cookie'] = cookie;\r\n \r\n // emit\r\n this.createEvent(events[i].payload);\r\n }\r\n}", | |
"expected_receive_period_in_days": "2", | |
"expected_update_period_in_days": "2" | |
}, | |
"schedule": "never", | |
"keep_events_for": 0, | |
"propagate_immediately": false | |
}, | |
{ | |
"type": "Agents::WebsiteAgent", | |
"name": "微信公众号【第二版】 #6 获取单篇文章全文", | |
"disabled": false, | |
"guid": "f7c52fcd85cea0d7426978202f4dc69a", | |
"options": { | |
"expected_update_period_in_days": "20", | |
"url_from_event": "{{url}}", | |
"type": "html", | |
"mode": "merge", | |
"extract": { | |
"desciption": { | |
"css": "#img-content", | |
"value": "./node()" | |
}, | |
"author": { | |
"css": "#js_name", | |
"value": "string(.)" | |
}, | |
"title": { | |
"css": "#activity-name", | |
"value": "string(.)" | |
} | |
} | |
}, | |
"schedule": "never", | |
"keep_events_for": 604800, | |
"propagate_immediately": true | |
} | |
], | |
"links": [ | |
{ | |
"source": 0, | |
"receiver": 1 | |
}, | |
{ | |
"source": 1, | |
"receiver": 6 | |
}, | |
{ | |
"source": 2, | |
"receiver": 5 | |
}, | |
{ | |
"source": 3, | |
"receiver": 2 | |
}, | |
{ | |
"source": 5, | |
"receiver": 0 | |
}, | |
{ | |
"source": 6, | |
"receiver": 4 | |
} | |
], | |
"control_links": [ | |
] | |
} |
刚测试了一下,发现不能执行?代码里有需要改动的地方吗?
不需要呐,我的现在还在正确运行呢,你看一下每个Agent的日志和Event输出,看看是在哪一步没有出正确结果,贴出来才能解决呐。
可能是之前我的ip被封了,从第一步开始结果就是空,刚才又重试了一次,抓取正常,多谢分享代码。
请问一下这个该怎么修改成我自己想要抓取的公众号
我只会简单的爬虫
请问一下这个该怎么修改成我自己想要抓取的公众号
我只会简单的爬虫
name 为 “微信公众号【第二版】 #1 搜索公众号” 的Agent中,url列表即为抓取的目标,以 "https://weixin.sogou.com/weixin?type=1&query=dili360&ie=utf8&s_from=input&_sug_=y&_sug_type_=" 为例,其中query=dili360 的dili360就是要抓取的公众号id,可以在搜狗微信搜索中查到。
大神,如果想添加一个公众号怎么操作?
@onewk 看楼上
好像现在搜狗的微信公众号也不更新了
好像现在搜狗的微信公众号也不更新了
越来越封闭了。以前是没能力连接信息,现在是商业竞争不允许连接信息,对用户来说,没什么进步。
你好 请问下 这个方法现在还能用吗
我也好久没用了,你试一试吧
…Sent from my iPhone
On Aug 20, 2020, at 12:10 PM, Yuan Huang ***@***.***> wrote:
***@***.*** commented on this gist.
你好 请问下 这个方法现在还能用吗
—
You are receiving this because you authored the thread.
Reply to this email directly, view it on GitHub, or unsubscribe.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
刚测试了一下,发现不能执行?代码里有需要改动的地方吗?