Skip to content

Instantly share code, notes, and snippets.

@wastee
Created October 19, 2021 06:57
Show Gist options
  • Save wastee/d6b56b7c06fe08deca0401296498a17d to your computer and use it in GitHub Desktop.
Save wastee/d6b56b7c06fe08deca0401296498a17d to your computer and use it in GitHub Desktop.
使用 FFmpeg 和 Python,根据视频内容来找出重复的视频
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "40f15af6",
"metadata": {
"ExecuteTime": {
"end_time": "2021-10-19T06:56:22.127623Z",
"start_time": "2021-10-19T06:56:22.086793Z"
}
},
"outputs": [],
"source": [
"import ffmpeg\n",
"from diffimg import diff\n",
"from glob import glob\n",
"from IPython.display import Image"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "46c97367",
"metadata": {
"ExecuteTime": {
"end_time": "2021-10-19T06:56:22.131302Z",
"start_time": "2021-10-19T06:56:22.129020Z"
}
},
"outputs": [],
"source": [
"video_path = '/home/tee/temp/工作/视频2/*.mp4'\n",
"img_path = '/home/tee/temp/工作/去重截图/'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "d522b039",
"metadata": {
"ExecuteTime": {
"end_time": "2021-10-19T06:56:22.135920Z",
"start_time": "2021-10-19T06:56:22.132984Z"
}
},
"outputs": [],
"source": [
"mp4s = glob(video_path)"
]
},
{
"cell_type": "markdown",
"id": "0febcac9",
"metadata": {},
"source": [
"# 文件名去重检查"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "153e85d1",
"metadata": {
"ExecuteTime": {
"end_time": "2021-10-19T06:56:22.141599Z",
"start_time": "2021-10-19T06:56:22.137799Z"
}
},
"outputs": [],
"source": [
"mp4s_name = [mp4.split('/')[-1] for mp4 in mp4s]\n",
"\n",
"assert set([x for x in mp4s_name if mp4s_name.count(x) > 1]) == set()"
]
},
{
"cell_type": "markdown",
"id": "205e3ff6",
"metadata": {},
"source": [
"# 取样截图"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "ba3cf54a",
"metadata": {
"ExecuteTime": {
"end_time": "2021-10-19T06:56:22.146402Z",
"start_time": "2021-10-19T06:56:22.142494Z"
}
},
"outputs": [],
"source": [
"def save_img(q, q_name):\n",
" out, _ = (\n",
" ffmpeg\n",
" .input(mp4, ss=q)\n",
" .filter('select', 'gte(n,{})'.format(1))\n",
" .output('pipe:', vframes=1, format='image2', vcodec='mjpeg')\n",
" .global_args('-loglevel', 'error')\n",
" .global_args('-y')\n",
" .run(capture_stdout=True)\n",
" )\n",
"\n",
" with open(img_path + mp4.split('/')[-1][:-4] + f'_{q_name}.jpg', 'wb') as f:\n",
" f.write(out)\n",
" return out"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "27166435",
"metadata": {
"ExecuteTime": {
"end_time": "2021-10-19T06:56:28.653726Z",
"start_time": "2021-10-19T06:56:22.147624Z"
},
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[处理中] ... /home/tee/temp/工作/视频2/横版_5.mp4\n",
"[处理中] ... /home/tee/temp/工作/视频2/横版_1.mp4\n",
"[处理中] ... /home/tee/temp/工作/视频2/横版_2.mp4\n",
"[处理中] ... /home/tee/temp/工作/视频2/横版_3.mp4\n",
"[处理中] ... /home/tee/temp/工作/视频2/横版_7.mp4\n",
"[处理中] ... /home/tee/temp/工作/视频2/横版_4.mp4\n",
"[处理中] ... /home/tee/temp/工作/视频2/横版_6.mp4\n"
]
}
],
"source": [
"for mp4 in mp4s:\n",
" \n",
" print(f'[处理中] ... {mp4}')\n",
" \n",
" duration = float(ffmpeg.probe(mp4)['streams'][0]['duration'])\n",
" q_add_on = duration / 5\n",
"\n",
" q1 = q_add_on\n",
" q2 = q1 + q_add_on\n",
" q3 = q2 + q_add_on\n",
" q4 = q3 + q_add_on\n",
"\n",
" save_img(q1, 'q1')\n",
" save_img(q2, 'q2')\n",
" save_img(q3, 'q3')\n",
" save_img(q4, 'q4')\n",
" \n",
"# break"
]
},
{
"cell_type": "markdown",
"id": "1c5ce132",
"metadata": {},
"source": [
"# diff 图片"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "e36aaac8",
"metadata": {
"ExecuteTime": {
"end_time": "2021-10-19T06:56:28.658794Z",
"start_time": "2021-10-19T06:56:28.655072Z"
}
},
"outputs": [],
"source": [
"q1_list = glob(f'{img_path}*_q1.jpg')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "f7d2bca2",
"metadata": {
"ExecuteTime": {
"end_time": "2021-10-19T06:56:28.671874Z",
"start_time": "2021-10-19T06:56:28.661737Z"
}
},
"outputs": [],
"source": [
"def diff_list(q1_list, ori_str, dst_str):\n",
" q2_list = []\n",
" for q1 in q1_list[1:]:\n",
" print(f'[检查 {ori_str} 中] ... {q1}')\n",
" if diff(q1_list[0], q1) < 0.1:\n",
" q2_list.append(q1)\n",
" q2_list = [q1_list[0]] + q2_list\n",
" q2_list = [q2.replace(ori_str, dst_str) for q2 in q2_list]\n",
" \n",
" return q2_list"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "bf615c4b",
"metadata": {
"ExecuteTime": {
"end_time": "2021-10-19T06:56:30.071298Z",
"start_time": "2021-10-19T06:56:28.673494Z"
},
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[检查 _q1.jpg 中] ... /home/tee/temp/工作/去重截图/横版_5_q1.jpg\n",
"[检查 _q1.jpg 中] ... /home/tee/temp/工作/去重截图/横版_4_q1.jpg\n",
"[检查 _q1.jpg 中] ... /home/tee/temp/工作/去重截图/横版_7_q1.jpg\n",
"[检查 _q1.jpg 中] ... /home/tee/temp/工作/去重截图/横版_3_q1.jpg\n",
"[检查 _q1.jpg 中] ... /home/tee/temp/工作/去重截图/横版_1_q1.jpg\n",
"[检查 _q1.jpg 中] ... /home/tee/temp/工作/去重截图/横版_6_q1.jpg\n"
]
}
],
"source": [
"q2_list = diff_list(q1_list, '_q1.jpg', '_q2.jpg')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "bb3f51eb",
"metadata": {
"ExecuteTime": {
"end_time": "2021-10-19T06:56:30.153994Z",
"start_time": "2021-10-19T06:56:30.073801Z"
},
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[检查 _q2.jpg 中] ... /home/tee/temp/工作/去重截图/横版_7_q2.jpg\n",
"[检查 _q2.jpg 中] ... /home/tee/temp/工作/去重截图/横版_6_q2.jpg\n"
]
}
],
"source": [
"q3_list = diff_list(q2_list, '_q2.jpg', '_q3.jpg')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "78121dec",
"metadata": {
"ExecuteTime": {
"end_time": "2021-10-19T06:56:30.248378Z",
"start_time": "2021-10-19T06:56:30.155230Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[检查 _q3.jpg 中] ... /home/tee/temp/工作/去重截图/横版_7_q3.jpg\n",
"[检查 _q3.jpg 中] ... /home/tee/temp/工作/去重截图/横版_6_q3.jpg\n"
]
}
],
"source": [
"q4_list = diff_list(q3_list, '_q3.jpg', '_q4.jpg')"
]
},
{
"cell_type": "markdown",
"id": "3a2698ce",
"metadata": {},
"source": [
"# diff 结果"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "89d50aee",
"metadata": {
"ExecuteTime": {
"end_time": "2021-10-19T06:56:30.253653Z",
"start_time": "2021-10-19T06:56:30.249814Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"有重复视频\n",
"\n",
"['横版_2', '横版_7', '横版_6']\n"
]
}
],
"source": [
"if q4_list == []:\n",
" print('无重复视频')\n",
"else:\n",
" duplicate_video = [v.split('/')[-1][:-7] for v in q4_list]\n",
" print('有重复视频')\n",
" print()\n",
" print(duplicate_video)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e9542932",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment