Created
July 18, 2024 06:53
-
-
Save woshichuanqilz/76c176cbbc6fd96e4ec8dc9b381c08e4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
#
# Extract the plain caption text from a WebVTT subtitle file.
#
# Usage:   <this-script> <filename>
# Output:  writes the extracted text to extracted_text.txt in the current
#          directory and prints a confirmation line on stdout.
# Exit:    0 on success, 1 on wrong usage, unreadable input or filter failure.

set -euo pipefail

readonly OUTPUT_FILE="extracted_text.txt"

#######################################
# Print a message on stderr and abort the script.
# Arguments: $* - message text
# Returns:   never; exits with status 1
#######################################
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Filter WebVTT read from stdin down to caption text on stdout.
#
# Per line, in this order:
#   1. drop lines containing '-->' (cue timing lines)
#   2. drop empty lines
#   3. drop lines starting with 'WEBVTT', 'Kind:' or 'Language:'
#   4. drop lines starting with 'align:'
#   5. strip every '<...>' tag, keeping the text between tags
#   6. drop lines that are now empty or whitespace-only
# Finally `uniq` collapses runs of identical adjacent lines (non-adjacent
# duplicates are kept).
#
# A former extra pass, 's/<[^>]*>.*<\/[^>]*>//g', was removed: it could never
# match after step 5 had already deleted every tag, and if it had run first
# its greedy '.*' would have discarded the caption text between tags.
#
# [[:space:]] is used instead of '\s' because '\s' is a GNU sed extension.
#######################################
extract_caption_text() {
  sed \
    -e '/-->/d' \
    -e '/^$/d' \
    -e '/^WEBVTT/d' \
    -e '/^Kind:/d' \
    -e '/^Language:/d' \
    -e '/^align:/d' \
    -e 's/<[^>]*>//g' \
    -e '/^[[:space:]]*$/d' \
    | uniq
}

#######################################
# Entry point: validate arguments, run the filter, report the result.
# Arguments: $1 - path of the WebVTT file to process
#######################################
main() {
  # Exactly one argument (the input file) is required.
  if [[ "$#" -ne 1 ]]; then
    die "Usage: $0 <filename>"
  fi

  local filename=$1

  # Fail before touching the output file, so a bad path neither leaves an
  # empty extracted_text.txt behind nor reports success.
  if [[ ! -r "$filename" || -d "$filename" ]]; then
    die "Error: cannot read input file: ${filename}"
  fi

  # Feeding the file via redirection keeps a name that starts with '-' from
  # being parsed as a sed option.
  extract_caption_text <"$filename" >"$OUTPUT_FILE" \
    || die "Error: failed to extract text from: ${filename}"

  echo "Text extracted to ${OUTPUT_FILE}"
}

main "$@"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment