# Gist: luzihang123/77c7f02304447dd2a273cca526a27e1c
# Created: November 2, 2018 09:23
# Title: 爬虫敏感图片的识别与过滤
#        (Detecting and filtering sensitive images in a web crawler)
#
# Note: the GitHub page warned that this file contains bidirectional Unicode
# text that may be interpreted or compiled differently than it appears.
# Review it in an editor that reveals hidden Unicode characters.
from functools import reduce

import requests
from PIL import Image
# Compute the image hash (the core is only three statements):
def phash(img):
    """Return a 64-bit hash of ``img`` as a non-negative integer.

    The image is shrunk to 8x8, converted to grayscale (mode ``'L'``), and
    each of the 64 pixels contributes one bit: 1 when the pixel value is at
    or above the mean of all 64 values, 0 otherwise. The pixel at position
    ``i`` in ``getdata()`` order is stored in bit ``i`` of the result.

    Note: despite the name, this is a mean-threshold ("average") hash; no
    DCT step is performed.

    Args:
        img: An image object providing ``resize``, ``convert`` and
            ``getdata`` (the script passes PIL images).

    Returns:
        int: The 64-bit hash value.
    """
    # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS names the same
    # resampling filter and is available in old and new Pillow releases.
    small = img.resize((8, 8), Image.LANCZOS).convert('L')
    pixels = list(small.getdata())
    avg = sum(pixels) / 64.

    bits = 0
    for index, value in enumerate(pixels):
        if value >= avg:
            bits |= 1 << index
    return bits
# Compute the Hamming distance:
def hamming_distance(a, b):
    """Return the number of bit positions in which ``a`` and ``b`` differ.

    Args:
        a: First integer (the script passes hash values from ``phash``).
        b: Second integer.

    Returns:
        int: Count of '1' digits in the binary form of ``a ^ b``.
    """
    # XOR leaves a 1 exactly where the two values disagree; count those bits.
    return bin(a ^ b).count('1')
# Decide whether two images are similar:
def is_imgs_similar(img1, img2, max_distance=5):
    """Return True when the two images' hashes are close enough.

    Both images are hashed with ``phash`` and the hashes are compared with
    ``hamming_distance``.

    Args:
        img1: First image.
        img2: Second image.
        max_distance: Largest number of differing hash bits that still
            counts as "similar". Defaults to 5, the value the script has
            always used.

    Returns:
        bool: True if the hashes differ in at most ``max_distance`` bits.
    """
    return hamming_distance(phash(img1), phash(img2)) <= max_distance
# Open a locally stored sensitive image.
# For this demo, a 1024X1024 image is pulled from the Sina image host and
# saved as sensitive.jpg.
sensitive_url = "https://ws4.sinaimg.cn/large/006tNbRwgy1fwttj7bi36j30sg0sgwm0.jpg"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5"}


def _download_image(url, path):
    """Download ``url`` into the file ``path`` and open it as a PIL image.

    Args:
        url: Address of the image to fetch.
        path: Local file name the response body is written to.

    Returns:
        The image returned by ``Image.open(path)``.

    Raises:
        RuntimeError: If the server does not answer with HTTP 200. Failing
            here avoids comparing against a missing or stale local file.
    """
    response = requests.get(url, headers=headers, timeout=300)
    if response.status_code != 200:
        raise RuntimeError(
            "failed to download {} (HTTP {})".format(url, response.status_code))
    with open(path, 'wb') as f:
        f.write(response.content)
    return Image.open(path)


sensitive_pic = _download_image(sensitive_url, "sensitive.jpg")

# Image fetched by the crawler
target_url = "https://ws3.sinaimg.cn/large/006tNbRwgy1fwttsauo6jj30h80han0y.jpg"
target_pic = _download_image(target_url, "target.jpg")

# Check whether the crawled image is similar to the sensitive image
if is_imgs_similar(target_pic, sensitive_pic):
    print("2张图片相似,替换敏感图片为”优雅的python“:{}".format("https://ws2.sinaimg.cn/large/006tNbRwgy1fw9yjmot3uj30y60y6q40.jpg"))
else:
    print("不相似")
# --- Web-page residue from the GitHub gist view (not part of the script) ---
# Sign up for free to join this conversation on GitHub.
# Already have an account? Sign in to comment.