Last active
April 8, 2021 10:44
-
-
Save tempelmann/8f5232550c55a9d9640e5eb7cb3d7cd8 to your computer and use it in GitHub Desktop.
Demonstrates that `[[NSURL fileURLWithPath:path] path]` may return a different string than was passed
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Created by Thomas Tempelmann on 6 Apr 21, updated 8 Apr 21. | |
// | |
// This code demonstrates a unicode normalization issue with reading file system paths on APFS | |
// volumes in macOS. | |
// | |
// TL;DR – In short, NSURL.path (probably) always returns paths in NFD form, even if the on-disk
// name is in NFC form, and even if you passed the on-disk path to NSURL to create it.
// Other path accessors, such as the canonical path, as well as BSD/POSIX functions | |
// will keep giving you the on-disk name. | |
// As a result, the name you get from NSURLNameKey may not be identical to the URL's | |
// path.lastPathComponent | |
// | |
// The issue is that the actual path, when passed to [NSURL fileURLWithPath:], may lose its | |
// original representation if the path uses precomposed unicode characters (mainly affects
// languages written in Latin script, such as French, German, Turkish).
// | |
// In other words: While paths you get from the lower level BSD/POSIX APIs may give you the | |
// original (raw) name as it's stored on-disk, NSURL and NSFileManager will give you a name | |
// adjusted better for use in macOS, where decomposed characters are the default (Windows and | |
// Linux prefer precomposed). So, if you'd get the file name via NSURL and then create a new | |
// file of the same name, it may use a different composition, which in turn may confuse | |
// software that isn't prepared for this, e.g. matching file names by raw string comparison. | |
// | |
// While such precomposed file names are fairly unusual in macOS, because many APIs decompose | |
// such file names before creating files, they may still be created by applications, terminal | |
// commands, shell scripts, or when using other file systems, such as NFS. | |
// | |
// In fact, Apple's own iTunes (now "Music") app will create such folder names as it bases them | |
// on the album and artist names stored in the audio files (e.g. as mp3 tags), even on | |
// macOS 11.2.3 - a sample file that contains a precomposed form of the Turkish "ğ"
// is available here: | |
// http://files.tempel.org/Various/macOS_CanonicalPath_UnicodeNormalization/precomposed_album_name.mp3 | |
// Simply import the file into the Music app and then locate its file in the iTunes folder. | |
// Then check its parent folder's name's composition, e.g. with this shell command: | |
// % ls ~/Music/Music/Media.localized/Thomas\ Tempelmann | xxd | |
// This will print: | |
// 00000000: 7072 6563 6f6d 706f 7365 645f c49f 0a precomposed_... | |
// Note that the bytes at the end of the file name for "ğ" are C4 9F in UTF-8, which is the | |
// precomposed form (U+011F), whereas the decomposed bytes, as preferred by macOS, are 67 CC 86. | |
// While this example file was constructed by me, I've seen mp3 files with those precomposed
// name encodings from various sources "in the wild". Basically, any audio file for which the
// tags were created on a Windows system is likely to use precomposed characters.
// | |
// How do other file systems behave in this regard? | |
// | |
// HFS+ is not affected by this irregular behavior because HFS+ always converts file names to | |
// NFD before storing them on-disk. This means, you cannot preserve NFC names in HFS+ the way | |
// you can in APFS (which is normalization-preserving and -agnostic). | |
// | |
// SMB is an especially tricky one: As such servers usually run on Linux or Windows, and these | |
// systems prefer NFC, on-disk file names are usually in that form as well (but not necessarily). | |
// However, Linux (not sure about Win) is not normalization-insensitive (APFS is), which means you | |
// could actually have two files named "ü" in the same directory, by differing in their normalization. | |
// To deal with this preference of NFC, macOS's SMB client blindly assumes (it has no other choice) | |
// that names are always using NFC on an SMB server. Therefore, when it asks the server to lookup a | |
// file name, it will pass the path always in NFC form. (And when it reads a dir, it returns the | |
// actual form). | |
// The problem arises when the file name on the server is in the unexpected NFD form - then the lookup | |
// will fail. | |
// This can, for instance, happen if one accesses the same server via NFS - then there's no normalization | |
// happening like it's done in SMB, unless you pass the "nfc" option when mounting it (see Apple's | |
// reply in http://www.openradar.me/radar?id=4984843016339456). | |
// In such cases, where the on-disk name on an SMB share uses NFD, it won't be accessible from macOS
// any more. E.g., if it's a folder, then you cannot access its contents, not even with BSD/POSIX
// functions.
#import <Foundation/Foundation.h>
#include <assert.h>
#include <dirent.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
// Path of the test directory in which checkname() creates its files. Set by main().
static NSString *tempDirPath;

/// Creates a file called `name` inside `tempDirPath` through the POSIX API, confirms via
/// readdir() that a directory entry with exactly those bytes exists, and then asserts how the
/// various NSURL accessors report that name back:
///   - NSURL.path                    -> differs from the original name
///   - NSURLNameKey                  -> equals the original name
///   - NSURLCanonicalPathKey         -> equals the original name
///   - fileReferenceURL.filePathURL  -> equals the original name
///
/// @param name The file name to test; the callers in this file pass precomposed (NFC) strings.
static void checkname (NSString *name)
{
    // Create a file with precomposed name inside the test dir.
    // We need to use the POSIX API because the higher level APIs normalize the path before creating the file.
    NSString *p0 = [tempDirPath stringByAppendingPathComponent:name];

    // open() requires a mode argument whenever O_CREAT is given; without it the permission
    // bits of the new file are undefined. The descriptor is only needed to create the file,
    // so it is closed again right away.
    int fd = open (p0.UTF8String, O_CREAT | O_WRONLY, 0644);
    assert (fd >= 0);
    if (fd >= 0) {
        close (fd);
    }

    // Get the file's name using POSIX functions, by reading the dir entry from the file system
    BOOL foundOnDisk = NO;
    DIR *dir = opendir (tempDirPath.UTF8String);
    assert (dir != NULL);
    if (dir != NULL) {
        const char *wanted = name.UTF8String;
        struct dirent *entry;
        // readdir() returns NULL once all entries were read, so the loop ends even if
        // no entry matches, instead of dereferencing a NULL entry.
        while ((entry = readdir (dir)) != NULL) {
            if (strcmp (entry->d_name, wanted) == 0) {
                foundOnDisk = YES; // found it
                break;
            }
        }
        closedir (dir);
    }

    // Verify that the on-disk file name is indeed using the precomposed format
    assert (foundOnDisk);

    // Create a NSURL object from the path to the file
    NSURL *url = [NSURL fileURLWithPath:p0];

    // Get the path of the url and check whether it matches our original path
    NSString *p2 = url.path;
    BOOL eq_path = [p0.lastPathComponent isEqualToString:p2.lastPathComponent]; // gives NO :(
    assert (!eq_path); // [NSURL path] gives a different result than the original name.

    // Get the file name of the url
    NSString *p3 = nil;
    BOOL gotName = [url getResourceValue:&p3 forKey:NSURLNameKey error:nil];
    assert (gotName);
    BOOL eq_name = [p0.lastPathComponent isEqualToString:p3]; // gives YES
    assert (eq_name);

    // Get the canonical path
    NSString *p4 = nil;
    BOOL gotCanonical = [url getResourceValue:&p4 forKey:NSURLCanonicalPathKey error:nil];
    assert (gotCanonical);
    BOOL eq_canonical = [p0.lastPathComponent isEqualToString:p4.lastPathComponent]; // gives YES
    assert (eq_canonical);

    // Get the original path through its file Reference (which is based on the inode / fileID)
    NSString *p5 = url.fileReferenceURL.filePathURL.path;
    BOOL eq_fref = [p0.lastPathComponent isEqualToString:p5.lastPathComponent]; // gives YES
    assert (eq_fref);

    // Keep the compiler quiet about unused results when asserts are compiled out (NDEBUG).
    (void)foundOnDisk; (void)eq_path; (void)gotName; (void)eq_name;
    (void)gotCanonical; (void)eq_canonical; (void)eq_fref;
}
/// Entry point: creates the test directory "precomposed" inside the temporary directory and
/// runs checkname() for two precomposed (NFC) single-letter file names.
///
/// @return 0 on success, 1 if the test directory could not be created.
int main(int argc, const char * argv[])
{
    @autoreleasepool {
        // The letter "ü", in precomposed form:
        NSString *ue_precomp = @"\xC3\xBC"; // decomposed: @"u\xCC\x88";
        // The letter "ğ", in precomposed form:
        NSString *gbreve_precomp = @"\xC4\x9F";

        // Create a test directory inside the tmp dir, named "precomposed"
        tempDirPath = [NSTemporaryDirectory() stringByAppendingPathComponent:@"precomposed"];
        NSError *error = nil;
        BOOL created = [NSFileManager.defaultManager createDirectoryAtPath:tempDirPath
                                               withIntermediateDirectories:YES
                                                                attributes:nil
                                                                     error:&error];
        if (!created) {
            // Without the directory, checkname() could only fail in a confusing way later on.
            NSLog(@"Could not create test directory %@: %@", tempDirPath, error);
            return 1;
        }

        checkname(ue_precomp);
        checkname(gbreve_precomp);
    }
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment