From 65abcb4c543561289737d21ddd49928407fd36bf Mon Sep 17 00:00:00 2001 From: Wincent Colaiuta Date: Mon, 11 May 2009 21:11:12 +0200 Subject: [PATCH] Reformat _Wikitext_utf8_to_utf32 for better readability Reduce line lengths to make the _Wikitext_utf8_to_utf32 function more readable, most notably by splitting lengthy condition expressions and bitwise-OR expressions across multiple lines. Signed-off-by: Wincent Colaiuta --- ext/parser.c | 86 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 60 insertions(+), 26 deletions(-) diff --git a/ext/parser.c b/ext/parser.c index 2dff381..379fd93 100644 --- a/ext/parser.c +++ b/ext/parser.c @@ -618,52 +618,86 @@ void _Wikitext_pop_excess_elements(parser_t *parser) #define INVALID_ENCODING(msg) do { if (dest_ptr) free(dest_ptr); rb_raise(eWikitextParserError, "invalid encoding: " msg); } while(0) -// convert a single UTF-8 codepoint to UTF-32 -// expects an input buffer, src, containing a UTF-8 encoded character (which may be multi-byte) -// the end of the input buffer, end, is also passed in to allow the detection of invalidly truncated codepoints -// the number of bytes in the UTF-8 character (between 1 and 4) is returned by reference in width_out -// raises a RangeError if the supplied character is invalid UTF-8 -// (in which case it also frees the block of memory indicated by dest_ptr if it is non-NULL) +// Convert a single UTF-8 codepoint to UTF-32 +// +// Expects an input buffer, src, containing a UTF-8 encoded character (which +// may be multi-byte). The end of the input buffer, end, is also passed in to +// allow the detection of invalidly truncated codepoints. The number of bytes +// in the UTF-8 character (between 1 and 4) is returned by reference in +// width_out. +// +// Raises eWikitextParserError (via INVALID_ENCODING) if the supplied character is invalid UTF-8 (in which +// case it also frees the block of memory indicated by dest_ptr if it is +// non-NULL). 
// Convert a single UTF-8 encoded character to its UTF-32 code point.
//
// src points at the lead byte of the character; end marks the end of the
// input buffer so that truncated multi-byte sequences can be detected. The
// number of bytes making up the character (1 to 4) is stored in *width_out.
//
// On invalid input the INVALID_ENCODING macro frees dest_ptr (when it is
// non-NULL) and raises eWikitextParserError through rb_raise, which does not
// return; that is why each branch can fall through to the decode step after
// its validation chain.
//
// NOTE(review): overlong three- and four-byte forms, UTF-16 surrogates and
// F4-led sequences above U+10FFFF are not rejected here (unchanged from the
// previous behaviour) -- confirm whether callers rely on that.
uint32_t _Wikitext_utf8_to_utf32(char *src, char *end, long *width_out, void *dest_ptr)
{
    // Unsigned view of the input so that comparisons and masks do not depend
    // on whether plain char is signed on this platform.
    const unsigned char *bytes = (const unsigned char *)src;
    uint32_t dest = 0;

    // Truncation checks compare (end - src) against the required width rather
    // than forming src + n, which could point beyond one-past-the-end.
    if (bytes[0] <= 0x7f)
    {
        // ASCII
        dest = bytes[0];
        *width_out = 1;
    }
    else if ((bytes[0] & 0xe0) == 0xc0)
    {
        // lead byte is 110..... : this should be a two-byte sequence
        if (end - src < 2)
        {
            // no second byte
            INVALID_ENCODING("truncated byte sequence");
        }
        else if (bytes[0] == 0xc0 || bytes[0] == 0xc1)
        {
            // lead byte of 110..... but code point <= 127
            INVALID_ENCODING("overlong encoding");
        }
        else if ((bytes[1] & 0xc0) != 0x80)
        {
            // second byte must start with 10......
            INVALID_ENCODING("malformed byte sequence");
        }

        dest =
            ((uint32_t)(bytes[0] & 0x1f)) << 6 |
            (uint32_t)(bytes[1] & 0x3f);
        *width_out = 2;
    }
    else if ((bytes[0] & 0xf0) == 0xe0)
    {
        // lead byte is 1110.... : this should be a three-byte sequence
        if (end - src < 3)
        {
            // missing second or third byte
            INVALID_ENCODING("truncated byte sequence");
        }
        else if ((bytes[1] & 0xc0) != 0x80 ||
                 (bytes[2] & 0xc0) != 0x80)
        {
            // second and third bytes must start with 10......
            INVALID_ENCODING("malformed byte sequence");
        }

        dest =
            ((uint32_t)(bytes[0] & 0x0f)) << 12 |
            ((uint32_t)(bytes[1] & 0x3f)) << 6 |
            (uint32_t)(bytes[2] & 0x3f);
        *width_out = 3;
    }
    else if ((bytes[0] & 0xf8) == 0xf0)
    {
        // lead byte is 11110... : this should be a four-byte sequence
        if (end - src < 4)
        {
            // missing second, third, or fourth byte
            INVALID_ENCODING("truncated byte sequence");
        }
        else if (bytes[0] >= 0xf5 && bytes[0] <= 0xf7)
        {
            // disallowed by RFC 3629 (codepoints above 0x10ffff)
            INVALID_ENCODING("overlong encoding");
        }
        else if ((bytes[1] & 0xc0) != 0x80 ||
                 (bytes[2] & 0xc0) != 0x80 ||
                 (bytes[3] & 0xc0) != 0x80)
        {
            // second, third and fourth bytes must start with 10......
            INVALID_ENCODING("malformed byte sequence");
        }

        // Each continuation byte contributes its own six payload bits; the
        // previous code used src[1] twice and never read src[3].
        dest =
            ((uint32_t)(bytes[0] & 0x07)) << 18 |
            ((uint32_t)(bytes[1] & 0x3f)) << 12 |
            ((uint32_t)(bytes[2] & 0x3f)) << 6 |
            (uint32_t)(bytes[3] & 0x3f);
        *width_out = 4;
    }
    else
    {
        // a continuation byte (10......) or 11111... in lead position
        INVALID_ENCODING("unexpected byte");
    }
    return dest;
}