Created
June 17, 2024 12:02
-
-
Save ImN1/688b6698901a0866a5ccbe33819a9e5c to your computer and use it in GitHub Desktop.
grouped similar filenames
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def groupedSimilarFilenames(filenames):
    """Group filenames that differ only in their last run of digits.

    Each name is split into three parts: an optional prefix, the last run
    of digits in the name (``number``), and an optional suffix (whatever
    follows those digits).  Names that share the same (prefix, suffix)
    pair are considered similar and form one group.

    Args:
        filenames: A one-dimensional sequence of filename strings.

    Returns:
        A new ``pandas.DataFrame`` with one row per input name and the
        columns ``filenames``, ``preffix``, ``number``, ``suffix`` and
        ``size`` (the column name ``preffix`` is kept as-is because
        callers read it).  ``size`` is the number of members in the row's
        group, or 0 when the name contains no digits at all.  A missing
        prefix or suffix is shown as NaN but still counts as a valid
        (empty) grouping key.  Rows are sorted by ``size`` in descending
        order; ties keep their input order.

    Example:
        For ``['cover.jpg', '008.jpg', '010.jpg', '015a.jpg']`` the result
        is::

            filenames  preffix  number  suffix  size
            008.jpg    NaN      008     .jpg    2
            010.jpg    NaN      010     .jpg    2
            015a.jpg   NaN      015     a.jpg   1
            cover.jpg  NaN      NaN     NaN     0

        Filtering on ``size`` picks out the names that do not belong to a
        numbered series.
    """
    # The prefix is greedy, so ``number`` is always the LAST run of digits.
    # ``.*\D`` (rather than ``.+\D``) lets a one-character prefix such as
    # the "a" in "a1.png" match; with ``.+`` such names did not match at all.
    pattern = r'^(?P<preffix>.*\D)?(?P<number>\d+)(?P<suffix>.+)?'

    # A one-dimensional input becomes a single column of names.
    frame = pd.DataFrame(filenames, columns=['filenames'])
    parts = frame['filenames'].str.extract(pattern)
    result = pd.concat([frame, parts], axis=1)

    # Only names that contain digits can belong to a group.
    numbered = parts['number'].notna()

    # groupby() silently drops rows whose key is NaN, which used to make a
    # name's size depend on whether *other* names happened to have a prefix.
    # Treat a missing prefix/suffix as an empty string for grouping only;
    # the returned columns keep their NaN values.
    keys = parts.loc[numbered, ['preffix', 'suffix']].fillna('')
    sizes = keys.groupby(['preffix', 'suffix'])['preffix'].transform('size')

    # Names without digits are absent from ``sizes`` and get size 0.
    result['size'] = sizes.reindex(result.index, fill_value=0).astype(int)

    return result.sort_values(
        'size', ascending=False, ignore_index=True, kind='stable'
    )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment