Looking at the open source CFXMLCreateStringByUnescapingEntities(), I’d say the 
code was written to an unrealistic deadline and never reviewed.  They scan 
through the string, looking for the ‘&’, then the ‘#’, then the digits, and 
finally scan forward for the ‘;’, up to the end of the string if necessary, but 
they don’t back out if the ‘;’ is not found.

As for whether this might be intentional leniency, I don’t think so.  There are 
quite a few comments, but none of them mention being lenient.  Furthermore, it 
doesn’t behave the way such leniency would be intended to.  Unicode 8224 is the 
dagger symbol.  If I omit the ‘;’ and give it:
        Hello&#8224World
it returns:
        Hello†
That is, the dagger is decoded, but the trailing “World” is silently dropped.

I’ve now submitted this as Bug ID #16424156, which includes a demo project that 
has the original open source function, and a patched version which seems to 
work.  Some serious testing is needed, but I have no more time for this issue 
today.  The code in the demo project is also pasted in below, for anyone who 
would like to review the patch.  Criticisms and/or suggested test cases would 
be appreciated!

On 2014 Mar 25, at 13:03, Quincey Morris <quinceymor...@rivergatesoftware.com> 
wrote:

> “accepting … without validation” meant, in this context, setting the NSString 
> as the value of a Core Data property.  The underlying problem is that 
> NSString objects are (in general, AFAIK) merely sequences of UTF-16 code 
> units, not sequences of *valid* UTF-16 code units, so that there are valid 
> NSStrings that aren’t valid Unicode. 

Quincey, thank you for this, because your initial explanation left me confused. 
 I get it.  This is quite astounding.  Anyone, say, downloading strings from a 
server, and inserting them into a Core Data store, should take heed.

I found several alternatives to CFXMLCreateStringByUnescapingEntities() listed 
here

http://stackoverflow.com/questions/659602/objective-c-html-escape-unescape

a couple of which I’ve used in the past, with issues, but remember deciding 
that CFXMLCreateStringByUnescapingEntities() was the best, most high-level 
method, reviewed and field-tested over many years by Apple :(  I’m worried that 
changing to one of these other methods in a production app will expose some 
*other* corner case bug, so I’m leaning toward using my patched 
CFXMLCreateStringByUnescapingEntities().


**** main.m *****

#import <Foundation/Foundation.h>

// The following is copied from
// http://www.opensource.apple.com/source/CF/CF-550/CFXMLParser.c
// except for the first two lines.
/*
 * Baseline copy of Apple's CF-550 CFXMLCreateStringByUnescapingEntities().
 *
 * Builds and returns a new mutable string in which "&name;" references are
 * replaced from a dictionary (the caller's entitiesDictionary, if any, plus
 * amp/quot/lt/gt/apos) and "&#NNN;" / "&#xHHH;" references are replaced by
 * the corresponding UTF-16 code unit(s).
 *
 * allocator          - passed through to every CF creation call below.
 * string             - the text to unescape; dereferenced without a NULL check.
 * entitiesDictionary - optional name -> replacement map; may be NULL.
 * Returns a newly created string that the caller is responsible for releasing.
 *
 * The logic is intentionally left exactly as Apple wrote it, because this
 * function is the reference the patched version in this file is compared
 * against.  The only change is that comments which had been hard-wrapped onto
 * continuation lines (and therefore no longer compiled) are rejoined.
 *
 * Known defects, deliberately preserved:
 *   - The inline buffer is initialised with length - 1, so the last character
 *     of the string lies outside the range the scanner reads from.
 *   - The characters following '&', '#' and 'x' are fetched without checking
 *     i against length, so i can step past the end of the string.
 *   - If no ';' is found, the scan runs to the end of the string and the
 *     entity is emitted anyway; for a numeric entity the skipped text is lost
 *     (e.g. "Hello&#8224World" yields "Hello" followed by U+2020).
 *   - The decoded number is not range-checked before being turned into
 *     UTF-16 code units.
 *   - 0xFFFF doubles as the "not numeric" sentinel, so a numeric reference
 *     whose value is 0xFFFF takes the named-entity path.
 */
CFStringRef CFXMLCreateStringByUnescapingEntities(CFAllocatorRef allocator, CFStringRef string, CFDictionaryRef entitiesDictionary) {
    // Can't find CFAssert1, oh well.  JK 2014-03-25
    // CFAssert1(string != NULL, __kCFLogAssertion, "%s(): NULL string not permitted.", __PRETTY_FUNCTION__);

    CFStringInlineBuffer inlineBuf; /* use this for fast traversal of the string in question */
    CFStringRef sub;
    CFIndex lastChunkStart, length = CFStringGetLength(string);
    CFIndex i, entityStart;
    UniChar uc;
    UInt32 entity;
    int base;
    CFMutableDictionaryRef fullReplDict = entitiesDictionary ? CFDictionaryCreateMutableCopy(allocator, 0, entitiesDictionary) : CFDictionaryCreateMutable(allocator, 0, &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks);

    // Add the five predefined XML entities to the working dictionary.
    CFDictionaryAddValue(fullReplDict, (const void *)CFSTR("amp"), (const void *)CFSTR("&"));
    CFDictionaryAddValue(fullReplDict, (const void *)CFSTR("quot"), (const void *)CFSTR("\""));
    CFDictionaryAddValue(fullReplDict, (const void *)CFSTR("lt"), (const void *)CFSTR("<"));
    CFDictionaryAddValue(fullReplDict, (const void *)CFSTR("gt"), (const void *)CFSTR(">"));
    CFDictionaryAddValue(fullReplDict, (const void *)CFSTR("apos"), (const void *)CFSTR("'"));

    // NOTE: "length - 1" is Apple's original range; it leaves the final
    // character outside the buffer (preserved on purpose, see header).
    CFStringInitInlineBuffer(string, &inlineBuf, CFRangeMake(0, length - 1));
    CFMutableStringRef newString = CFStringCreateMutable(allocator, 0);

    lastChunkStart = 0;
    // Scan through the string in its entirety
    for(i = 0; i < length; ) {
        uc = CFStringGetCharacterFromInlineBuffer(&inlineBuf, i); i++;  // grab the next character and move i.

        if(uc == '&') {
            entityStart = i - 1;
            entity = 0xFFFF;    // set this to a not-Unicode character as sentinel
            // we've hit the beginning of an entity. Copy everything from lastChunkStart to this point.
            if(lastChunkStart < i - 1) {
                sub = CFStringCreateWithSubstring(allocator, string, CFRangeMake(lastChunkStart, (i - 1) - lastChunkStart));
                CFStringAppend(newString, sub);
                CFRelease(sub);
            }

            // NOTE: no "i < length" check here or on the two fetches below.
            uc = CFStringGetCharacterFromInlineBuffer(&inlineBuf, i); i++;      // grab the next character and move i.
            // Now we can process the entity reference itself
            if(uc == '#') {     // this is a numeric entity.
                base = 10;
                entity = 0;
                uc = CFStringGetCharacterFromInlineBuffer(&inlineBuf, i); i++;

                if(uc == 'x') { // only lowercase x allowed. Translating numeric entity as hexadecimal.
                    base = 16;
                    uc = CFStringGetCharacterFromInlineBuffer(&inlineBuf, i); i++;
                }

                // process the provided digits 'til we're finished
                while(true) {
                    if (uc >= '0' && uc <= '9')
                        entity = entity * base + (uc-'0');
                    else if (uc >= 'a' && uc <= 'f' && base == 16)
                        entity = entity * base + (uc-'a'+10);
                    else if (uc >= 'A' && uc <= 'F' && base == 16)
                        entity = entity * base + (uc-'A'+10);
                    else break;

                    if (i < length) {
                        uc = CFStringGetCharacterFromInlineBuffer(&inlineBuf, i); i++;
                    }
                    else
                        break;
                }
            }

            // Scan to the end of the entity
            // NOTE: this loop also ends when the string runs out; the code
            // below does not distinguish that case from having found ';'.
            while(uc != ';' && i < length) {
                uc = CFStringGetCharacterFromInlineBuffer(&inlineBuf, i); i++;
            }

            if(0xFFFF != entity) { // it was numeric, and translated.
                // Now, output the result of the entity
                if(entity >= 0x10000) {
                    // Split into a high/low surrogate pair.
                    UniChar characters[2] = { ((entity - 0x10000) >> 10) + 0xD800, ((entity - 0x10000) & 0x3ff) + 0xDC00 };
                    CFStringAppendCharacters(newString, characters, 2);
                } else {
                    UniChar character = entity;
                    CFStringAppendCharacters(newString, &character, 1);
                }
            } else {    // it wasn't numeric.
                sub = CFStringCreateWithSubstring(allocator, string, CFRangeMake(entityStart + 1, (i - entityStart - 2))); // This trims off the & and ; from the string, so we can use it against the dictionary itself.
                CFStringRef replacementString = (CFStringRef)CFDictionaryGetValue(fullReplDict, sub);
                if(replacementString) {
                    CFStringAppend(newString, replacementString);
                } else {
                    CFRelease(sub); // let the old substring go, since we didn't find it in the dictionary
                    sub =  CFStringCreateWithSubstring(allocator, string, CFRangeMake(entityStart, (i - entityStart))); // create a new one, including the & and ;
                    CFStringAppend(newString, sub); // ...and append that.
                }
                CFRelease(sub); // in either case, release the most-recent "sub"
            }

            // move the lastChunkStart to the beginning of the next chunk.
            lastChunkStart = i;
        }
    }
    if(lastChunkStart < length) { // we've come out of the loop, let's get the rest of the string and tack it on.
        sub = CFStringCreateWithSubstring(allocator, string, CFRangeMake(lastChunkStart, i - lastChunkStart));
        CFStringAppend(newString, sub);
        CFRelease(sub);
    }

    CFRelease(fullReplDict);

    return newString;
}

// Patched version of the above
/*
 * Returns a newly created string in which well-formed XML entity references
 * in `string` are replaced by the text they stand for.  Anything that is not
 * a well-formed, resolvable reference is copied through unchanged.
 *
 * allocator          - passed through to every CF creation call below.
 * string             - the text to unescape.  NULL yields a NULL result.
 * entitiesDictionary - optional CFString name -> CFString replacement map,
 *                      may be NULL.  amp, quot, lt, gt and apos are added
 *                      when the caller has not supplied them.
 * Returns a string the caller must release, or NULL when `string` is NULL.
 *
 * A reference is replaced only when all of the following hold:
 *   - Named:   "&name;" where name contains no '&' and is a key of the
 *              dictionary described above.
 *   - Numeric: "&#DDD;" or "&#xHHH;" (lowercase 'x' only), with at least one
 *              digit, the ';' immediately after the last digit, and a value
 *              that is a Unicode scalar value (<= 0x10FFFF and not in the
 *              surrogate range 0xD800-0xDFFF).
 *
 * On any failure only the '&' is treated as literal text and scanning resumes
 * with the very next character, so one stray '&' can no longer swallow the
 * text (or the valid entities) that follow it.
 */
CFStringRef PatchedCFXMLCreateStringByUnescapingEntities(CFAllocatorRef allocator, CFStringRef string, CFDictionaryRef entitiesDictionary) {
    if (string == NULL) {
        return NULL;
    }

    const CFIndex length = CFStringGetLength(string);

    CFMutableDictionaryRef fullReplDict = entitiesDictionary
        ? CFDictionaryCreateMutableCopy(allocator, 0, entitiesDictionary)
        : CFDictionaryCreateMutable(allocator, 0, &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks);

    // Add the five predefined XML entities to the working dictionary.
    CFDictionaryAddValue(fullReplDict, (const void *)CFSTR("amp"), (const void *)CFSTR("&"));
    CFDictionaryAddValue(fullReplDict, (const void *)CFSTR("quot"), (const void *)CFSTR("\""));
    CFDictionaryAddValue(fullReplDict, (const void *)CFSTR("lt"), (const void *)CFSTR("<"));
    CFDictionaryAddValue(fullReplDict, (const void *)CFSTR("gt"), (const void *)CFSTR(">"));
    CFDictionaryAddValue(fullReplDict, (const void *)CFSTR("apos"), (const void *)CFSTR("'"));

    // The buffer spans the whole string.  (Apple's original used length - 1,
    // which misses the ';' when the string ends in an entity.)
    CFStringInlineBuffer inlineBuf; /* use this for fast traversal of the string in question */
    CFStringInitInlineBuffer(string, &inlineBuf, CFRangeMake(0, length));
    CFMutableStringRef newString = CFStringCreateMutable(allocator, 0);

    CFIndex lastChunkStart = 0;     // first character not yet copied to newString
    CFIndex i = 0;
    while (i < length) {
        if (CFStringGetCharacterFromInlineBuffer(&inlineBuf, i) != '&') {
            i++;
            continue;
        }

        const CFIndex entityStart = i;          // index of the '&'
        CFIndex cursor = entityStart + 1;       // every read below is guarded by cursor < length
        Boolean matched = false;
        CFStringRef replacementString = NULL;   // owned by fullReplDict, not by us
        UniChar replacementChars[2];
        CFIndex replacementCharCount = 0;

        if (cursor < length && CFStringGetCharacterFromInlineBuffer(&inlineBuf, cursor) == '#') {
            // Candidate numeric character reference.
            cursor++;
            UInt32 base = 10;
            if (cursor < length && CFStringGetCharacterFromInlineBuffer(&inlineBuf, cursor) == 'x') {
                // only lowercase x allowed. Translating numeric entity as hexadecimal.
                base = 16;
                cursor++;
            }

            UInt32 entity = 0;
            CFIndex digitCount = 0;
            Boolean inRange = true;
            while (cursor < length) {
                const UniChar uc = CFStringGetCharacterFromInlineBuffer(&inlineBuf, cursor);
                UInt32 digit;
                if (uc >= '0' && uc <= '9')
                    digit = (UInt32)(uc - '0');
                else if (base == 16 && uc >= 'a' && uc <= 'f')
                    digit = (UInt32)(uc - 'a' + 10);
                else if (base == 16 && uc >= 'A' && uc <= 'F')
                    digit = (UInt32)(uc - 'A' + 10);
                else
                    break;

                // Stop accumulating once the value is out of range so that an
                // arbitrarily long digit run cannot wrap the 32-bit value.
                if (inRange) {
                    entity = entity * base + digit;
                    if (entity > 0x10FFFF)
                        inRange = false;
                }
                digitCount++;
                cursor++;
            }

            const Boolean terminated = (cursor < length && CFStringGetCharacterFromInlineBuffer(&inlineBuf, cursor) == ';');
            const Boolean isSurrogate = (entity >= 0xD800 && entity <= 0xDFFF);
            if (digitCount > 0 && terminated && inRange && !isSurrogate) {
                if (entity >= 0x10000) {
                    // Outside the BMP: encode as a high/low surrogate pair.
                    replacementChars[0] = (UniChar)(((entity - 0x10000) >> 10) + 0xD800);
                    replacementChars[1] = (UniChar)(((entity - 0x10000) & 0x3ff) + 0xDC00);
                    replacementCharCount = 2;
                } else {
                    replacementChars[0] = (UniChar)entity;
                    replacementCharCount = 1;
                }
                matched = true;
                cursor++;   // step past the ';'
            }
        } else {
            // Candidate named entity: the name runs up to the next ';'.  Hitting
            // another '&' (or the end of the string) first means this '&' did
            // not start a reference.
            CFIndex nameEnd = cursor;
            while (nameEnd < length) {
                const UniChar uc = CFStringGetCharacterFromInlineBuffer(&inlineBuf, nameEnd);
                if (uc == ';' || uc == '&')
                    break;
                nameEnd++;
            }

            if (nameEnd < length && CFStringGetCharacterFromInlineBuffer(&inlineBuf, nameEnd) == ';') {
                // The name is the text strictly between the '&' and the ';'.
                CFStringRef name = CFStringCreateWithSubstring(allocator, string, CFRangeMake(entityStart + 1, nameEnd - entityStart - 1));
                replacementString = (CFStringRef)CFDictionaryGetValue(fullReplDict, name);
                CFRelease(name);
                if (replacementString) {
                    matched = true;
                    cursor = nameEnd + 1;   // step past the ';'
                }
            }
        }

        if (!matched) {
            // Not a reference we can resolve.  Leave lastChunkStart alone so
            // the '&' is copied verbatim with the surrounding literal text,
            // and resume scanning right after it.
            i = entityStart + 1;
            continue;
        }

        // Copy the literal text that precedes the reference, then its replacement.
        if (lastChunkStart < entityStart) {
            CFStringRef sub = CFStringCreateWithSubstring(allocator, string, CFRangeMake(lastChunkStart, entityStart - lastChunkStart));
            CFStringAppend(newString, sub);
            CFRelease(sub);
        }
        if (replacementString) {
            CFStringAppend(newString, replacementString);
        } else {
            CFStringAppendCharacters(newString, replacementChars, replacementCharCount);
        }

        // move the lastChunkStart to the beginning of the next chunk.
        i = cursor;
        lastChunkStart = cursor;
    }

    if (lastChunkStart < length) { // we've come out of the loop, let's get the rest of the string and tack it on.
        CFStringRef sub = CFStringCreateWithSubstring(allocator, string, CFRangeMake(lastChunkStart, length - lastChunkStart));
        CFStringAppend(newString, sub);
        CFRelease(sub);
    }

    CFRelease(fullReplDict);

    return newString;
}

// Returns s when -UTF8String produces a non-NULL C string for it, and nil
// otherwise.  Used by the tests below as a check that an unescaped result can
// be represented as UTF-8.
NSString *ValidateString(NSString *s) {
    if ([s UTF8String] == NULL) {
        return nil;
    }
    return s;
}

// Runs s through both Apple's unescaping function and the patched one, and
// logs the input followed by each result.  A result is logged as (null) when
// ValidateString() rejects it.
void TestString(NSString* s) {
    NSLog(@"Testing:         %@", s) ;

    // Try Apple's function.
    // Both functions follow the CF "Create" rule and return a +1 reference.
    // CFBridgingRelease hands that reference to ARC; a plain __bridge cast
    // would leave it unowned and leak the string.
    NSString* result1 = CFBridgingRelease(CFXMLCreateStringByUnescapingEntities(NULL, (__bridge CFStringRef)s, NULL)) ;
    result1 = ValidateString(result1) ;
    NSLog(@"   Apple Result: %@", result1) ;

    // Try patched function
    NSString* result2 = CFBridgingRelease(PatchedCFXMLCreateStringByUnescapingEntities(NULL, (__bridge CFStringRef)s, NULL)) ;
    result2 = ValidateString(result2) ;
    NSLog(@" Patched Result: %@", result2) ;
}

// Entry point of the demo: feeds each sample string to TestString() in turn.
// The first two samples are well-formed; the last two omit the closing ';'.
int main(int argc, const char * argv[])
{
    @autoreleasepool {
        NSArray *samples = @[
            @"Hello&#160;World",
            @"Here is a gamma: &#915;.  Here is a dagger: &#8224;",
            @"Hello&#8224World",
            @"&#13207494",
        ] ;

        for (NSString *sample in samples) {
            TestString(sample) ;
        }
    }
    return 0;
}




_______________________________________________

Cocoa-dev mailing list (Cocoa-dev@lists.apple.com)

Please do not post admin requests or moderator comments to the list.
Contact the moderators at cocoa-dev-admins(at)lists.apple.com

Help/Unsubscribe/Update your Subscription:
https://lists.apple.com/mailman/options/cocoa-dev/archive%40mail-archive.com

This email sent to arch...@mail-archive.com

Reply via email to