Changeset 9a55e8384d5115113f5623b98d66660ae2a89972
- Timestamp:
- 2006-12-15 15:16:08
(2 years ago)
- Author:
- Rémi Denis-Courmont <rem@videolan.org>
- git-committer:
- Rémi Denis-Courmont <rem@videolan.org> 1166192168 +0000
- git-parent:
[d2dff4466b7e125a3734a3e83f3f0e476dc623e8]
- git-author:
- Rémi Denis-Courmont <rem@videolan.org> 1166192168 +0000
- Message:
Simpler UTF-8 check functions + rudimentary unit test
-
Files:
-
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
| r6ed527e |
r9a55e83 |
|
| 364 | 364 | ############################################################################### |
|---|
| 365 | 365 | if USE_LIBTOOL |
|---|
| 366 | | check_PROGRAMS = test_i18n_atof test_url |
|---|
| | 366 | check_PROGRAMS = test_i18n_atof test_url test_utf8 |
|---|
| 367 | 367 | TESTS = $(check_PROGRAMS) |
|---|
| | 368 | endif |
|---|
| 368 | 369 | |
|---|
| 369 | 370 | CFLAGS_tests = `$(VLC_CONFIG) --cflags libvlc` |
|---|
| … | … | |
| 376 | 377 | test_url_LDADD = libvlc.la |
|---|
| 377 | 378 | test_url_CFLAGS = $(CFLAGS_tests) |
|---|
| 378 | | endif |
|---|
| | 379 | |
|---|
| | 380 | test_utf8_SOURCES = test/utf8.c |
|---|
| | 381 | test_utf8_LDADD = libvlc.la |
|---|
| | 382 | test_utf8_CFLAGS = $(CFLAGS_tests) |
|---|
| 379 | 383 | |
|---|
| 380 | 384 | FORCE: |
|---|
| r911511b |
r9a55e83 |
|
| 640 | 640 | |
|---|
| 641 | 641 | static char *CheckUTF8( char *str, char rep ) |
|---|
| 642 | | #define isutf8cont( c ) (((c) >= 0x80) && ((c) <= 0xBF)) |
|---|
| 643 | | { |
|---|
| 644 | | unsigned char *ptr, c; |
|---|
| 645 | | |
|---|
| | 642 | { |
|---|
| | 643 | uint8_t *ptr = (uint8_t *)str; |
|---|
| 646 | 644 | assert (str != NULL); |
|---|
| 647 | 645 | |
|---|
| 648 | | ptr = (unsigned char *)str; |
|---|
| 649 | | while( (c = *ptr) != '\0' ) |
|---|
| 650 | | { |
|---|
| 651 | | /* US-ASCII, 1 byte */ |
|---|
| 652 | | if( c <= 0x7F ) |
|---|
| 653 | | ptr++; /* OK */ |
|---|
| 654 | | else |
|---|
| 655 | | /* 2 bytes */ |
|---|
| 656 | | if( ( c >= 0xC2 ) && ( c <= 0xDF ) ) |
|---|
| | 646 | for (;;) |
|---|
| | 647 | { |
|---|
| | 648 | uint8_t c = ptr[0]; |
|---|
| | 649 | int charlen = -1; |
|---|
| | 650 | |
|---|
| | 651 | if (c == '\0') |
|---|
| | 652 | break; |
|---|
| | 653 | |
|---|
| | 654 | for (int i = 0; i < 7; i++) |
|---|
| | 655 | if ((c >> (7 - i)) == ((0xff >> (7 - i)) ^ 1)) |
|---|
| | 656 | { |
|---|
| | 657 | charlen = i; |
|---|
| | 658 | break; |
|---|
| | 659 | } |
|---|
| | 660 | |
|---|
| | 661 | switch (charlen) |
|---|
| 657 | 662 | { |
|---|
| 658 | | c = ptr[1]; |
|---|
| 659 | | if( isutf8cont( c ) ) |
|---|
| 660 | | ptr += 2; /* OK */ |
|---|
| 661 | | else |
|---|
| | 663 | case 0: // 7-bit ASCII character -> OK |
|---|
| | 664 | ptr++; |
|---|
| | 665 | continue; |
|---|
| | 666 | |
|---|
| | 667 | case -1: // 1111111x -> error |
|---|
| | 668 | case 1: // continuation byte -> error |
|---|
| 662 | 669 | goto error; |
|---|
| 663 | 670 | } |
|---|
| 664 | | else |
|---|
| 665 | | /* 3 bytes */ |
|---|
| 666 | | if( c == 0xE0 ) |
|---|
| | 671 | |
|---|
| | 672 | assert (charlen >= 2); |
|---|
| | 673 | |
|---|
| | 674 | uint32_t cp = c & ~((0xff >> (7 - charlen)) << (7 - charlen)); |
|---|
| | 675 | for (int i = 1; i < charlen; i++) |
|---|
| 667 | 676 | { |
|---|
| 668 | | c = ptr[1]; |
|---|
| 669 | | if( ( c >= 0xA0 ) && ( c <= 0xBF ) ) |
|---|
| 670 | | { |
|---|
| 671 | | c = ptr[2]; |
|---|
| 672 | | if( isutf8cont( c ) ) |
|---|
| 673 | | ptr += 3; /* OK */ |
|---|
| 674 | | else |
|---|
| 675 | | goto error; |
|---|
| 676 | | } |
|---|
| 677 | | else |
|---|
| | 677 | assert (cp < (1 << 26)); |
|---|
| | 678 | c = ptr[i]; |
|---|
| | 679 | |
|---|
| | 680 | if ((c == '\0') // unexpected end of string |
|---|
| | 681 | || ((c >> 6) != 2)) // not a continuation byte |
|---|
| 678 | 682 | goto error; |
|---|
| | 683 | |
|---|
| | 684 | cp = (cp << 6) | (ptr[i] & 0x3f); |
|---|
| 679 | 685 | } |
|---|
| 680 | | else |
|---|
| 681 | | if( ( ( c >= 0xE1 ) && ( c <= 0xEC ) ) || ( c == 0xEC ) |
|---|
| 682 | | || ( c == 0xEE ) || ( c == 0xEF ) ) |
|---|
| 683 | | { |
|---|
| 684 | | c = ptr[1]; |
|---|
| 685 | | if( isutf8cont( c ) ) |
|---|
| 686 | | { |
|---|
| 687 | | c = ptr[2]; |
|---|
| 688 | | if( isutf8cont( c ) ) |
|---|
| 689 | | ptr += 3; /* OK */ |
|---|
| 690 | | else |
|---|
| 691 | | goto error; |
|---|
| 692 | | } |
|---|
| 693 | | else |
|---|
| 694 | | goto error; |
|---|
| 695 | | } |
|---|
| 696 | | else |
|---|
| 697 | | if( c == 0xED ) |
|---|
| 698 | | { |
|---|
| 699 | | c = ptr[1]; |
|---|
| 700 | | if( ( c >= 0x80 ) && ( c <= 0x9F ) ) |
|---|
| 701 | | { |
|---|
| 702 | | c = ptr[2]; |
|---|
| 703 | | if( isutf8cont( c ) ) |
|---|
| 704 | | ptr += 3; /* OK */ |
|---|
| 705 | | else |
|---|
| 706 | | goto error; |
|---|
| 707 | | } |
|---|
| 708 | | else |
|---|
| 709 | | goto error; |
|---|
| 710 | | } |
|---|
| 711 | | else |
|---|
| 712 | | /* 4 bytes */ |
|---|
| 713 | | if( c == 0xF0 ) |
|---|
| 714 | | { |
|---|
| 715 | | c = ptr[1]; |
|---|
| 716 | | if( ( c >= 0x90 ) && ( c <= 0xBF ) ) |
|---|
| 717 | | { |
|---|
| 718 | | c = ptr[2]; |
|---|
| 719 | | if( isutf8cont( c ) ) |
|---|
| 720 | | { |
|---|
| 721 | | c = ptr[3]; |
|---|
| 722 | | if( isutf8cont( c ) ) |
|---|
| 723 | | ptr += 4; /* OK */ |
|---|
| 724 | | else |
|---|
| 725 | | goto error; |
|---|
| 726 | | } |
|---|
| 727 | | else |
|---|
| 728 | | goto error; |
|---|
| 729 | | } |
|---|
| 730 | | else |
|---|
| 731 | | goto error; |
|---|
| 732 | | } |
|---|
| 733 | | else |
|---|
| 734 | | if( ( c >= 0xF1 ) && ( c <= 0xF3 ) ) |
|---|
| 735 | | { |
|---|
| 736 | | c = ptr[1]; |
|---|
| 737 | | if( isutf8cont( c ) ) |
|---|
| 738 | | { |
|---|
| 739 | | c = ptr[2]; |
|---|
| 740 | | if( isutf8cont( c ) ) |
|---|
| 741 | | { |
|---|
| 742 | | c = ptr[3]; |
|---|
| 743 | | if( isutf8cont( c ) ) |
|---|
| 744 | | ptr += 4; /* OK */ |
|---|
| 745 | | goto error; |
|---|
| 746 | | } |
|---|
| 747 | | else |
|---|
| 748 | | goto error; |
|---|
| 749 | | } |
|---|
| 750 | | else |
|---|
| 751 | | goto error; |
|---|
| 752 | | } |
|---|
| 753 | | else |
|---|
| 754 | | if( c == 0xF4 ) |
|---|
| 755 | | { |
|---|
| 756 | | c = ptr[1]; |
|---|
| 757 | | if( ( c >= 0x80 ) && ( c <= 0x8F ) ) |
|---|
| 758 | | { |
|---|
| 759 | | c = ptr[2]; |
|---|
| 760 | | if( isutf8cont( c ) ) |
|---|
| 761 | | { |
|---|
| 762 | | c = ptr[3]; |
|---|
| 763 | | if( isutf8cont( c ) ) |
|---|
| 764 | | ptr += 4; /* OK */ |
|---|
| 765 | | else |
|---|
| 766 | | goto error; |
|---|
| 767 | | } |
|---|
| 768 | | else |
|---|
| 769 | | goto error; |
|---|
| 770 | | } |
|---|
| 771 | | else |
|---|
| 772 | | goto error; |
|---|
| 773 | | } |
|---|
| 774 | | else |
|---|
| | 686 | |
|---|
| | 687 | if (cp < 128) // overlong (special case for ASCII) |
|---|
| 775 | 688 | goto error; |
|---|
| 776 | | |
|---|
| | 689 | if (cp < (1u << (5 * charlen - 3))) // overlong |
|---|
| | 690 | goto error; |
|---|
| | 691 | |
|---|
| | 692 | ptr += charlen; |
|---|
| 777 | 693 | continue; |
|---|
| 778 | 694 | |
|---|
| 779 | | error: |
|---|
| 780 | | if( rep == 0 ) |
|---|
| | 695 | error: |
|---|
| | 696 | if (rep == 0) |
|---|
| 781 | 697 | return NULL; |
|---|
| 782 | | *ptr++ = '?'; |
|---|
| | 698 | *ptr++ = rep; |
|---|
| 783 | 699 | str = NULL; |
|---|
| 784 | 700 | } |
|---|