Looking at the open source CFXMLCreateStringByUnescapingEntities(), I’d say the code was written to an unrealistic deadline and never reviewed. They scan through the string, looking for the '&', then the '#', then the digits, and finally scan for the ';', up to the end of the string if necessary, but they don’t back out if the ';' is not found.
As for whether this might be intentional leniency, I don’t think so. There are quite a few comments, but none of them mention being lenient. Furthermore, it doesn’t work as such lenience would intend. Unicode 8224 is the dagger symbol. If I omit the ‘;’ and give it: Hello&#8224World it returns: Hello† (the "World" is lost). I’ve now submitted this as Bug ID #16424156, which includes a demo project that has the original open source function, and a patched version which seems to work. Some serious testing is needed, but no more time for this issue today. The code in the demo project is also pasted in below, for anyone who would like to review the patch. Criticisms and/or suggested test cases would be appreciated! On 2014 Mar 25, at 13:03, Quincey Morris <quinceymor...@rivergatesoftware.com> wrote: > “accepting … without validation” meant, in this context, setting the NSString > as the value of a Core Data property. The underlying problem is that > NSString objects are (in general, AFAIK) merely sequences of UTF-16 code > units, not sequences of *valid* UTF-16 code units, so that there are valid > NSStrings that aren’t valid Unicode. Quincey, thank you for this, because your initial explanation left me confused. I get it. This is quite astounding. Anyone, say, downloading strings from a server, and inserting them into a Core Data store, should take heed. I found several alternatives to CFXMLCreateStringByUnescapingEntities() listed here http://stackoverflow.com/questions/659602/objective-c-html-escape-unescape a couple of which I’ve used in the past, with issues, but I remember deciding that CFXMLCreateStringByUnescapingEntities() was the best, most high-level method, reviewed and field-tested over many years by Apple :( I’m worried that changing to one of these other methods in a production app will expose some *other* corner case bug, so I’m leaning toward using my patched CFXMLCreateStringByUnescapingEntities(). 
// **** main.m *****

#import <Foundation/Foundation.h>

// The following is copied from http://www.opensource.apple.com/source/CF/CF-550/CFXMLParser.c
// except for the first two lines.
//
// It is kept with its original logic ON PURPOSE: it is the "before" exhibit that
// TestString() compares against the patched version.  Known problems it demonstrates:
//   * the inline buffer is initialised with (length - 1), so the last character of
//     the string is not visible through the buffer;
//   * after '&' it scans for ';' up to the end of the string and never backs out
//     when no ';' is found, so "Hello&#8224World" comes back as "Hello†";
//   * any numeric value is emitted as-is, including lone surrogates.
CFStringRef CFXMLCreateStringByUnescapingEntities(CFAllocatorRef allocator, CFStringRef string, CFDictionaryRef entitiesDictionary) {
    // Can't find CFAssert1, oh well.  JK 2014-03-25
    // CFAssert1(string != NULL, __kCFLogAssertion, "%s(): NULL string not permitted.", __PRETTY_FUNCTION__);

    CFStringInlineBuffer inlineBuf; /* use this for fast traversal of the string in question */
    CFStringRef sub;
    CFIndex lastChunkStart, length = CFStringGetLength(string);
    CFIndex i, entityStart;
    UniChar uc;
    UInt32 entity;
    int base;

    CFMutableDictionaryRef fullReplDict = entitiesDictionary ?
        CFDictionaryCreateMutableCopy(allocator, 0, entitiesDictionary) :
        CFDictionaryCreateMutable(allocator, 0, &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks);

    CFDictionaryAddValue(fullReplDict, (const void *)CFSTR("amp"), (const void *)CFSTR("&"));
    CFDictionaryAddValue(fullReplDict, (const void *)CFSTR("quot"), (const void *)CFSTR("\""));
    CFDictionaryAddValue(fullReplDict, (const void *)CFSTR("lt"), (const void *)CFSTR("<"));
    CFDictionaryAddValue(fullReplDict, (const void *)CFSTR("gt"), (const void *)CFSTR(">"));
    CFDictionaryAddValue(fullReplDict, (const void *)CFSTR("apos"), (const void *)CFSTR("'"));

    CFStringInitInlineBuffer(string, &inlineBuf, CFRangeMake(0, length - 1));
    CFMutableStringRef newString = CFStringCreateMutable(allocator, 0);

    lastChunkStart = 0;
    // Scan through the string in its entirety
    for(i = 0; i < length; ) {
        uc = CFStringGetCharacterFromInlineBuffer(&inlineBuf, i); i++; // grab the next character and move i.

        if(uc == '&') {
            entityStart = i - 1;
            entity = 0xFFFF; // set this to a not-Unicode character as sentinel
            // we've hit the beginning of an entity. Copy everything from lastChunkStart to this point.
            if(lastChunkStart < i - 1) {
                sub = CFStringCreateWithSubstring(allocator, string, CFRangeMake(lastChunkStart, (i - 1) - lastChunkStart));
                CFStringAppend(newString, sub);
                CFRelease(sub);
            }

            uc = CFStringGetCharacterFromInlineBuffer(&inlineBuf, i); i++; // grab the next character and move i.
            // Now we can process the entity reference itself
            if(uc == '#') { // this is a numeric entity.
                base = 10;
                entity = 0;
                uc = CFStringGetCharacterFromInlineBuffer(&inlineBuf, i); i++;

                if(uc == 'x') { // only lowercase x allowed. Translating numeric entity as hexadecimal.
                    base = 16;
                    uc = CFStringGetCharacterFromInlineBuffer(&inlineBuf, i); i++;
                }

                // process the provided digits 'til we're finished
                while(true) {
                    if (uc >= '0' && uc <= '9')
                        entity = entity * base + (uc-'0');
                    else if (uc >= 'a' && uc <= 'f' && base == 16)
                        entity = entity * base + (uc-'a'+10);
                    else if (uc >= 'A' && uc <= 'F' && base == 16)
                        entity = entity * base + (uc-'A'+10);
                    else break;

                    if (i < length) {
                        uc = CFStringGetCharacterFromInlineBuffer(&inlineBuf, i); i++;
                    }
                    else
                        break;
                }
            }

            // Scan to the end of the entity
            while(uc != ';' && i < length) {
                uc = CFStringGetCharacterFromInlineBuffer(&inlineBuf, i); i++;
            }

            if(0xFFFF != entity) { // it was numeric, and translated.
                // Now, output the result of the entity
                if(entity >= 0x10000) {
                    UniChar characters[2] = { ((entity - 0x10000) >> 10) + 0xD800, ((entity - 0x10000) & 0x3ff) + 0xDC00 };
                    CFStringAppendCharacters(newString, characters, 2);
                } else {
                    UniChar character = entity;
                    CFStringAppendCharacters(newString, &character, 1);
                }
            } else { // it wasn't numeric.
                sub = CFStringCreateWithSubstring(allocator, string, CFRangeMake(entityStart + 1, (i - entityStart - 2))); // This trims off the & and ; from the string, so we can use it against the dictionary itself.
                CFStringRef replacementString = (CFStringRef)CFDictionaryGetValue(fullReplDict, sub);
                if(replacementString) {
                    CFStringAppend(newString, replacementString);
                } else {
                    CFRelease(sub); // let the old substring go, since we didn't find it in the dictionary
                    sub = CFStringCreateWithSubstring(allocator, string, CFRangeMake(entityStart, (i - entityStart))); // create a new one, including the & and ;
                    CFStringAppend(newString, sub); // ...and append that.
                }
                CFRelease(sub); // in either case, release the most-recent "sub"
            }

            // move the lastChunkStart to the beginning of the next chunk.
            lastChunkStart = i;
        }
    }
    if(lastChunkStart < length) { // we've come out of the loop, let's get the rest of the string and tack it on.
        sub = CFStringCreateWithSubstring(allocator, string, CFRangeMake(lastChunkStart, i - lastChunkStart));
        CFStringAppend(newString, sub);
        CFRelease(sub);
    }

    CFRelease(fullReplDict);
    return newString;
}

// ---------------------------------------------------------------------------
// Patched version of the above
// ---------------------------------------------------------------------------

// Appends the characters string[start, end) to output.  An empty or inverted
// range is a no-op, so callers never have to special-case "nothing pending".
static void PatchedAppendSubstring(CFAllocatorRef allocator, CFMutableStringRef output, CFStringRef string, CFIndex start, CFIndex end) {
    if (start >= end) {
        return;
    }
    CFStringRef chunk = CFStringCreateWithSubstring(allocator, string, CFRangeMake(start, end - start));
    if (chunk != NULL) {
        CFStringAppend(output, chunk);
        CFRelease(chunk);
    }
}

// Returns true when codePoint may be written into the result.  Rejected:
//   * values above U+10FFFF (not Unicode at all),
//   * U+D800..U+DFFF (a surrogate on its own is ill-formed UTF-16),
//   * U+FFFE / U+FFFF (non-characters; the original code also never decoded
//     0xFFFF because it used that value as its "not numeric" sentinel).
static bool PatchedIsAcceptableCodePoint(UInt32 codePoint) {
    if (codePoint > 0x10FFFF) {
        return false;
    }
    if (codePoint >= 0xD800 && codePoint <= 0xDFFF) {
        return false;
    }
    if (codePoint == 0xFFFE || codePoint == 0xFFFF) {
        return false;
    }
    return true;
}

// Tries to decode a numeric character reference ("&#1234;" or "&#x4d2;") whose
// '&' is at ampIndex and whose '#' is at ampIndex + 1.
//
// On success returns a new string (caller releases) holding the decoded
// character and stores the index just past the ';' in *outEnd.
// Returns NULL, leaving *outEnd untouched, when the reference has no digits,
// is not terminated by ';' directly after the digits, or names a code point
// rejected by PatchedIsAcceptableCodePoint().
static CFStringRef PatchedCopyNumericReplacement(CFAllocatorRef allocator, CFStringInlineBuffer *buffer, CFIndex length, CFIndex ampIndex, CFIndex *outEnd) {
    CFIndex pos = ampIndex + 2; // first character after "&#"
    UInt32 base = 10;

    // Only lowercase x is allowed, as in the original implementation.
    if (pos < length && CFStringGetCharacterFromInlineBuffer(buffer, pos) == 'x') {
        base = 16;
        pos++;
    }

    UInt32 value = 0;
    bool sawDigit = false;
    for (; pos < length; pos++) {
        UniChar c = CFStringGetCharacterFromInlineBuffer(buffer, pos);
        UInt32 digit;
        if (c >= '0' && c <= '9') {
            digit = c - '0';
        } else if (base == 16 && c >= 'a' && c <= 'f') {
            digit = c - 'a' + 10;
        } else if (base == 16 && c >= 'A' && c <= 'F') {
            digit = c - 'A' + 10;
        } else {
            break;
        }
        sawDigit = true;
        // Saturate instead of wrapping: once the value is out of the Unicode
        // range it stays out of range, however many digits follow.
        if (value <= 0x10FFFF) {
            value = value * base + digit;
        }
    }

    if (!sawDigit || pos >= length) {
        return NULL;
    }
    if (CFStringGetCharacterFromInlineBuffer(buffer, pos) != ';') {
        return NULL;
    }
    if (!PatchedIsAcceptableCodePoint(value)) {
        return NULL;
    }

    UniChar units[2];
    CFIndex unitCount;
    if (value >= 0x10000) {
        // Outside the BMP: encode as a UTF-16 surrogate pair.
        units[0] = (UniChar)(((value - 0x10000) >> 10) + 0xD800);
        units[1] = (UniChar)(((value - 0x10000) & 0x3FF) + 0xDC00);
        unitCount = 2;
    } else {
        units[0] = (UniChar)value;
        unitCount = 1;
    }

    CFStringRef decoded = CFStringCreateWithCharacters(allocator, units, unitCount);
    if (decoded != NULL) {
        *outEnd = pos + 1;
    }
    return decoded;
}

// Tries to decode a named entity reference ("&amp;") whose '&' is at ampIndex.
//
// The name is everything between the '&' and the next ';'.  The scan gives up
// at a second '&' (an entity name cannot contain one), which keeps a stray
// '&' from swallowing a later, valid entity and keeps the overall scan linear.
//
// On success returns the retained dictionary value (caller releases) and
// stores the index just past the ';' in *outEnd.  Returns NULL, leaving
// *outEnd untouched, when there is no ';' or the name is not in entities.
static CFStringRef PatchedCopyNamedReplacement(CFAllocatorRef allocator, CFStringRef string, CFStringInlineBuffer *buffer, CFIndex length, CFIndex ampIndex, CFDictionaryRef entities, CFIndex *outEnd) {
    CFIndex pos = ampIndex + 1;
    UniChar c = 0;
    while (pos < length) {
        c = CFStringGetCharacterFromInlineBuffer(buffer, pos);
        if (c == ';' || c == '&') {
            break;
        }
        pos++;
    }
    if (pos >= length || c != ';') {
        return NULL;
    }

    // Trim off the '&' and ';' so the name can be looked up in the dictionary.
    CFStringRef name = CFStringCreateWithSubstring(allocator, string, CFRangeMake(ampIndex + 1, pos - ampIndex - 1));
    if (name == NULL) {
        return NULL;
    }
    CFStringRef replacement = (CFStringRef)CFDictionaryGetValue(entities, name);
    CFRelease(name);
    if (replacement == NULL) {
        return NULL;
    }

    *outEnd = pos + 1;
    return (CFStringRef)CFRetain(replacement);
}

// Returns a new string (Create rule: the caller releases it) in which every
// well-formed entity reference in `string` has been replaced by its text.
//
//   allocator           allocator used for every object created here; may be NULL.
//   string              text to unescape.  NULL yields NULL.
//   entitiesDictionary  optional name -> replacement string map.  The five
//                       predefined XML entities (amp, quot, lt, gt, apos) are
//                       added for any name the caller did not supply.
//
// Anything that is not a well-formed reference is copied through unchanged:
// a '&' with no terminating ';', an unknown name, a numeric reference with no
// digits, or one that names a surrogate / out-of-range code point.  So
// "Hello&#8224World" comes back as "Hello&#8224World", not "Hello†".
CFStringRef PatchedCFXMLCreateStringByUnescapingEntities(CFAllocatorRef allocator, CFStringRef string, CFDictionaryRef entitiesDictionary) {
    // Can't find CFAssert1, oh well.  JK 2014-03-25
    // CFAssert1(string != NULL, __kCFLogAssertion, "%s(): NULL string not permitted.", __PRETTY_FUNCTION__);
    if (string == NULL) {
        return NULL;
    }

    CFIndex length = CFStringGetLength(string);

    CFMutableDictionaryRef fullReplDict = entitiesDictionary ?
        CFDictionaryCreateMutableCopy(allocator, 0, entitiesDictionary) :
        CFDictionaryCreateMutable(allocator, 0, &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks);

    // CFDictionaryAddValue leaves an existing key alone, so caller-supplied
    // definitions of these names win over the defaults.
    CFDictionaryAddValue(fullReplDict, (const void *)CFSTR("amp"), (const void *)CFSTR("&"));
    CFDictionaryAddValue(fullReplDict, (const void *)CFSTR("quot"), (const void *)CFSTR("\""));
    CFDictionaryAddValue(fullReplDict, (const void *)CFSTR("lt"), (const void *)CFSTR("<"));
    CFDictionaryAddValue(fullReplDict, (const void *)CFSTR("gt"), (const void *)CFSTR(">"));
    CFDictionaryAddValue(fullReplDict, (const void *)CFSTR("apos"), (const void *)CFSTR("'"));

    // The original used (length - 1) here, which hides the final character --
    // typically the ';' of an entity at the very end of the string.
    CFStringInlineBuffer inlineBuf; /* use this for fast traversal of the string in question */
    CFStringInitInlineBuffer(string, &inlineBuf, CFRangeMake(0, length));

    CFMutableStringRef newString = CFStringCreateMutable(allocator, 0);

    CFIndex lastChunkStart = 0; // start of the literal text not yet copied to newString
    CFIndex i = 0;
    while (i < length) {
        if (CFStringGetCharacterFromInlineBuffer(&inlineBuf, i) != '&') {
            i++;
            continue;
        }

        CFIndex entityEnd = i + 1;
        CFStringRef replacement;
        if (i + 1 < length && CFStringGetCharacterFromInlineBuffer(&inlineBuf, i + 1) == '#') {
            replacement = PatchedCopyNumericReplacement(allocator, &inlineBuf, length, i, &entityEnd);
        } else {
            replacement = PatchedCopyNamedReplacement(allocator, string, &inlineBuf, length, i, fullReplDict, &entityEnd);
        }

        if (replacement == NULL) {
            // Not an entity after all.  Leave the '&' in the pending literal
            // run and resume right after it, so nothing is ever skipped.
            i++;
            continue;
        }

        PatchedAppendSubstring(allocator, newString, string, lastChunkStart, i);
        CFStringAppend(newString, replacement);
        CFRelease(replacement);

        i = entityEnd;
        lastChunkStart = entityEnd;
    }

    // We've come out of the loop; tack on whatever literal text is left.
    PatchedAppendSubstring(allocator, newString, string, lastChunkStart, length);

    CFRelease(fullReplDict);
    return newString;
}

// Returns s when -UTF8String can produce a C string for it, otherwise nil.
// Used by TestString() to flag results that cannot be converted to UTF-8.
NSString* ValidateString(NSString* s) {
    const char* cString = [s UTF8String] ;
    return (cString != NULL) ? s : nil ;
}

// Runs s through both the Apple copy and the patched function and logs each
// result (nil is logged when ValidateString() rejects it).
void TestString(NSString* s) {
    NSLog(@"Testing: %@", s) ;

    // Try Apple's function.
    // Both functions follow the Create rule, so ownership must be handed to
    // ARC with CFBridgingRelease; a plain __bridge cast would leak the result.
    NSString* result1 = CFBridgingRelease(CFXMLCreateStringByUnescapingEntities(NULL, (__bridge CFStringRef)s, NULL)) ;
    result1 = ValidateString(result1) ;
    NSLog(@" Apple Result: %@", result1) ;

    // Try patched function
    NSString* result2 = CFBridgingRelease(PatchedCFXMLCreateStringByUnescapingEntities(NULL, (__bridge CFStringRef)s, NULL)) ;
    result2 = ValidateString(result2) ;
    NSLog(@" Patched Result: %@", result2) ;
}

int main(int argc, const char * argv[]) {
    @autoreleasepool {
        TestString(@"Hello World") ;
        // The numeric entities below are written out explicitly; in the
        // archived copy of this post they had been rendered into the
        // characters themselves, which leaves nothing for the decoder to do.
        TestString(@"Here is a gamma: &#915;. Here is a dagger: &#8224;") ;
        TestString(@"Hello&#8224World") ;       // missing ';' after the dagger
        TestString(@"�") ;
        // NOTE(review): the line above survives only as U+FFFD in the archive;
        // it was presumably a reference producing invalid UTF-16, like the
        // lone high surrogate below -- confirm against the bug report's demo project.
        TestString(@"&#55357;") ;               // U+D83D, a high surrogate on its own
        TestString(@"AT&T &amp; co") ;          // stray '&' followed by a real entity
    }
    return 0;
}

/*
_______________________________________________
Cocoa-dev mailing list (Cocoa-dev@lists.apple.com)
Please do not post admin requests or moderator comments to the list.
Contact the moderators at cocoa-dev-admins(at)lists.apple.com
Help/Unsubscribe/Update your Subscription: https://lists.apple.com/mailman/options/cocoa-dev/archive%40mail-archive.com
This email sent to arch...@mail-archive.com
*/