Changeset bae04ee86f451d93afa221ae95a51d0bb424b634
- Timestamp:
- 31/05/07 11:56:25 (1 year ago)
- git-parent:
- Files:
-
- configure.ac (modified) (5 diffs)
- modules/video_chroma/Modules.am (modified) (1 diff)
- modules/video_chroma/i420_yuy2.c (modified) (17 diffs)
- modules/video_chroma/i420_yuy2.h (modified) (4 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
configure.ac
r11884b8 rbae04ee 1270 1270 THREEDNOW_MODULES="memcpy3dn" 1271 1271 SSE_MODULES="" 1272 SSE2_MODULES="" 1272 1273 ALTIVEC_MODULES="memcpyaltivec i420_yuy2_altivec" 1273 1274 #ALTIVEC_MODULES="${ALTIVEC_MODULES} idctaltivec motionaltivec" … … 1276 1277 then 1277 1278 MMX_MODULES="${MMX_MODULES} i420_yuy2_mmx" 1278 fi 1279 1280 AC_CACHE_CHECK([if \$CC groks MMX inline assembly], 1281 [ac_cv_mmx_inline], 1282 [CFLAGS="${CFLAGS_save}" 1283 AC_TRY_COMPILE(,[void *p;asm volatile("packuswb %%mm1,%%mm2"::"r"(p));], 1284 ac_cv_mmx_inline=yes, ac_cv_mmx_inline=no)]) 1285 if test "${ac_cv_mmx_inline}" != "no"; then 1286 AC_DEFINE(CAN_COMPILE_MMX, 1, Define if \$CC groks MMX inline assembly.) 1287 ACCEL_MODULES="${ACCEL_MODULES} ${MMX_MODULES}" 1279 SSE2_MODULES="${SSE2_MODULES} i420_yuy2_sse2" 1288 1280 fi 1289 1281 … … 1313 1305 fi 1314 1306 1307 dnl Check for fully working SSE2 intrinsics 1308 dnl We need support for -msse2, we need <emmintrin.h>, and we also need a 1309 dnl working compiler (http://gcc.gnu.org/bugzilla/show_bug.cgi?id=23963) 1310 AC_CACHE_CHECK([if \$CC groks SSE2 intrinsics], 1311 [ac_cv_c_sse2_intrinsics], 1312 [CFLAGS="${CFLAGS_save} -O -msse2" 1313 AC_TRY_COMPILE([#include <emmintrin.h> 1314 #include <stdint.h> 1315 uint64_t frobzor;], 1316 [__m128i a, b, c; 1317 a = b = c = _mm_set1_epi64((__m64)frobzor); 1318 a = _mm_slli_epi16(a, 3); 1319 a = _mm_adds_epi16(a, b); 1320 c = _mm_srli_epi16(c, 8); 1321 c = _mm_slli_epi16(c, 3); 1322 b = _mm_adds_epi16(b, c); 1323 a = _mm_unpacklo_epi8(a, b); 1324 frobzor = (uint64_t)_mm_movepi64_pi64(a);], 1325 [ac_cv_c_sse2_intrinsics=yes], 1326 [ac_cv_c_sse2_intrinsics=no])]) 1327 if test "${ac_cv_c_sse2_intrinsics}" != "no"; then 1328 AC_DEFINE(HAVE_SSE2_INTRINSICS, 1, Define if SSE2 intrinsics are available.) 
1329 dnl VLC_ADD_CFLAGS([i420_rgb_sse2],[-msse2]) 1330 fi 1331 1332 AC_CACHE_CHECK([if \$CC groks MMX inline assembly], 1333 [ac_cv_mmx_inline], 1334 [CFLAGS="${CFLAGS_save}" 1335 AC_TRY_COMPILE(,[void *p;asm volatile("packuswb %%mm1,%%mm2"::"r"(p));], 1336 ac_cv_mmx_inline=yes, ac_cv_mmx_inline=no)]) 1337 if test "${ac_cv_mmx_inline}" != "no"; then 1338 AC_DEFINE(CAN_COMPILE_MMX, 1, Define if \$CC groks MMX inline assembly.) 1339 ACCEL_MODULES="${ACCEL_MODULES} ${MMX_MODULES}" 1340 fi 1341 1315 1342 AC_CACHE_CHECK([if \$CC groks MMX EXT inline assembly], 1316 1343 [ac_cv_mmxext_inline], … … 1341 1368 AC_DEFINE(CAN_COMPILE_SSE, 1, Define if \$CC groks SSE inline assembly.) 1342 1369 ACCEL_MODULES="${ACCEL_MODULES} ${SSE_MODULES}" 1370 fi 1371 1372 AC_CACHE_CHECK([if \$CC groks SSE2 inline assembly], 1373 [ac_cv_sse2_inline], 1374 [CFLAGS="${CFLAGS_save}" 1375 AC_TRY_COMPILE(,[void *p;asm volatile("punpckhqdq %%xmm1,%%xmm2"::"r"(p));], 1376 ac_cv_sse2_inline=yes, ac_cv_sse2_inline=no)]) 1377 if test "${ac_cv_sse2_inline}" != "no" -a "${SYS}" != "solaris"; then 1378 AC_DEFINE(CAN_COMPILE_SSE2, 1, Define if \$CC groks SSE2 inline assembly.) 1379 ACCEL_MODULES="${ACCEL_MODULES} ${SSE2_MODULES}" 1343 1380 fi 1344 1381 … … 1493 1530 then 1494 1531 ARCH="${ARCH} mmx" 1532 VLC_ADD_BUILTINS([${ACCEL_MODULES}]) 1533 fi 1534 if test "${host_cpu}" = "i686" -o "${host_cpu}" = "x86_64" 1535 then 1536 ARCH="${ARCH} sse sse2" 1495 1537 VLC_ADD_BUILTINS([${ACCEL_MODULES}]) 1496 1538 fi modules/video_chroma/Modules.am
rb3e689d rbae04ee 20 20 21 21 SOURCES_i420_yuy2_mmx = \ 22 i420_yuy2.c \ 23 i420_yuy2.h \ 24 $(NULL) 25 26 SOURCES_i420_yuy2_sse2 = \ 22 27 i420_yuy2.c \ 23 28 i420_yuy2.h \ modules/video_chroma/i420_yuy2.c
rd3fe7f2 rbae04ee 6 6 * 7 7 * Authors: Samuel Hocevar <sam@zoy.org> 8 * Damien Fouilleul <damien@videolan.org> 8 9 * 9 10 * This program is free software; you can redistribute it and/or modify … … 43 44 #elif defined (MODULE_NAME_IS_i420_yuy2_mmx) 44 45 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv" 46 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2) 47 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422,IUYV,cyuv" 45 48 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec) 46 49 # define DEST_FOURCC "YUY2,YUNV,YVYU,UYVY,UYNV,Y422" … … 64 67 65 68 #ifdef MODULE_NAME_IS_i420_yuy2_mmx 66 static uint64_t i_00ffw; 67 static uint64_t i_80w; 69 /* Initialize MMX-specific constants */ 70 static const uint64_t i_00ffw = 0x00ff00ff00ff00ffULL; 71 static const uint64_t i_80w = 0x0000000080808080ULL; 68 72 #endif 69 73 … … 79 83 set_capability( "chroma", 100 ); 80 84 add_requirement( MMX ); 81 /* Initialize MMX-specific constants */ 82 i_00ffw = 0x00ff00ff00ff00ffULL; 83 i_80w = 0x0000000080808080ULL; 85 #elif defined (MODULE_NAME_IS_i420_yuy2_sse2) 86 set_description( _("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) ); 87 set_capability( "chroma", 120 ); 88 add_requirement( SSE2 ); 84 89 #elif defined (MODULE_NAME_IS_i420_yuy2_altivec) 85 90 set_description( … … 126 131 p_vout->chroma.pf_convert = I420_UYVY; 127 132 break; 128 129 133 #if !defined (MODULE_NAME_IS_i420_yuy2_altivec) 130 134 case VLC_FOURCC('I','U','Y','V'): … … 257 261 - p_dest->p->i_visible_pitch; 258 262 263 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2) 259 264 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) 260 265 { … … 266 271 267 272 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx) 268 for( i_x = p_vout->render.i_width / 2 ; i_x-- ; ) 269 { 273 for( i_x = p_vout->render.i_width / 8; i_x-- ; ) 274 { 275 C_YUV420_YUYV( ); 276 C_YUV420_YUYV( ); 277 C_YUV420_YUYV( ); 270 278 C_YUV420_YUYV( ); 271 279 } … … 275 283 MMX_CALL( MMX_YUV420_YUYV ); 276 284 } 285 #endif 277 286 for( i_x = ( 
p_vout->render.i_width % 8 ) / 2; i_x-- ; ) 278 287 { 279 288 C_YUV420_YUYV( ); 280 289 } 281 #endif 282 290 283 291 p_y1 += i_source_margin; … … 289 297 } 290 298 299 #if defined (MODULE_NAME_IS_i420_yuy2_mmx) 300 __asm__ __volatile__("emms" :: ); 301 #endif 302 291 303 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) 292 304 } 293 305 #endif 306 307 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2) 308 /* 309 ** SSE2 128-bit fetch/store instructions are faster 310 ** if memory access is 16 bytes aligned 311 */ 312 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch| 313 ((int)p_line2|(int)p_y2))) ) 314 { 315 /* use faster SSE2 aligned fetch and store */ 316 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) 317 { 318 p_line1 = p_line2; 319 p_line2 += p_dest->p->i_pitch; 320 321 p_y1 = p_y2; 322 p_y2 += p_source->p[Y_PLANE].i_pitch; 323 324 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; ) 325 { 326 SSE2_CALL( SSE2_YUV420_YUYV_ALIGNED ); 327 } 328 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; ) 329 { 330 C_YUV420_YUYV( ); 331 } 332 333 p_y1 += i_source_margin; 334 p_y2 += i_source_margin; 335 p_u += i_source_margin_c; 336 p_v += i_source_margin_c; 337 p_line1 += i_dest_margin; 338 p_line2 += i_dest_margin; 339 } 340 } 341 else 342 { 343 /* use slower SSE2 unaligned fetch and store */ 344 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) 345 { 346 p_line1 = p_line2; 347 p_line2 += p_dest->p->i_pitch; 348 349 p_y1 = p_y2; 350 p_y2 += p_source->p[Y_PLANE].i_pitch; 351 352 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; ) 353 { 354 SSE2_CALL( SSE2_YUV420_YUYV_UNALIGNED ); 355 } 356 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; ) 357 { 358 C_YUV420_YUYV( ); 359 } 360 361 p_y1 += i_source_margin; 362 p_y2 += i_source_margin; 363 p_u += i_source_margin_c; 364 p_v += i_source_margin_c; 365 p_line1 += i_dest_margin; 366 p_line2 += i_dest_margin; 367 } 368 } 369 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2) 294 370 } 295 371 … … 394 470 - 
p_dest->p->i_visible_pitch; 395 471 472 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2) 396 473 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) 397 474 { … … 421 498 p_line2 += i_dest_margin; 422 499 } 500 501 #if defined (MODULE_NAME_IS_i420_yuy2_mmx) 502 __asm__ __volatile__("emms" :: ); 503 #endif 504 423 505 #if defined (MODULE_NAME_IS_i420_yuy2_altivec) 424 506 } 425 507 #endif 508 509 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2) 510 /* 511 ** SSE2 128-bit fetch/store instructions are faster 512 ** if memory access is 16 bytes aligned 513 */ 514 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch| 515 ((int)p_line2|(int)p_y2))) ) 516 { 517 /* use faster SSE2 aligned fetch and store */ 518 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) 519 { 520 p_line1 = p_line2; 521 p_line2 += p_dest->p->i_pitch; 522 523 p_y1 = p_y2; 524 p_y2 += p_source->p[Y_PLANE].i_pitch; 525 526 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; ) 527 { 528 SSE2_CALL( SSE2_YUV420_YVYU_ALIGNED ); 529 } 530 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; ) 531 { 532 C_YUV420_YVYU( ); 533 } 534 535 p_y1 += i_source_margin; 536 p_y2 += i_source_margin; 537 p_u += i_source_margin_c; 538 p_v += i_source_margin_c; 539 p_line1 += i_dest_margin; 540 p_line2 += i_dest_margin; 541 } 542 } 543 else 544 { 545 /* use slower SSE2 unaligned fetch and store */ 546 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) 547 { 548 p_line1 = p_line2; 549 p_line2 += p_dest->p->i_pitch; 550 551 p_y1 = p_y2; 552 p_y2 += p_source->p[Y_PLANE].i_pitch; 553 554 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; ) 555 { 556 SSE2_CALL( SSE2_YUV420_YVYU_UNALIGNED ); 557 } 558 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; ) 559 { 560 C_YUV420_YVYU( ); 561 } 562 563 p_y1 += i_source_margin; 564 p_y2 += i_source_margin; 565 p_u += i_source_margin_c; 566 p_v += i_source_margin_c; 567 p_line1 += i_dest_margin; 568 p_line2 += i_dest_margin; 569 } 570 } 571 #endif // 
defined(MODULE_NAME_IS_i420_yuy2_sse2) 426 572 } 427 573 … … 526 672 - p_dest->p->i_visible_pitch; 527 673 674 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2) 528 675 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) 529 676 { … … 565 712 } 566 713 #endif 714 715 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2) 716 /* 717 ** SSE2 128-bit fetch/store instructions are faster 718 ** if memory access is 16 bytes aligned 719 */ 720 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch| 721 ((int)p_line2|(int)p_y2))) ) 722 { 723 /* use faster SSE2 aligned fetch and store */ 724 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) 725 { 726 p_line1 = p_line2; 727 p_line2 += p_dest->p->i_pitch; 728 729 p_y1 = p_y2; 730 p_y2 += p_source->p[Y_PLANE].i_pitch; 731 732 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; ) 733 { 734 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED ); 735 } 736 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; ) 737 { 738 C_YUV420_UYVY( ); 739 } 740 741 p_y1 += i_source_margin; 742 p_y2 += i_source_margin; 743 p_u += i_source_margin_c; 744 p_v += i_source_margin_c; 745 p_line1 += i_dest_margin; 746 p_line2 += i_dest_margin; 747 } 748 } 749 else 750 { 751 /* use slower SSE2 unaligned fetch and store */ 752 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) 753 { 754 p_line1 = p_line2; 755 p_line2 += p_dest->p->i_pitch; 756 757 p_y1 = p_y2; 758 p_y2 += p_source->p[Y_PLANE].i_pitch; 759 760 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; ) 761 { 762 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED ); 763 } 764 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; ) 765 { 766 C_YUV420_UYVY( ); 767 } 768 769 p_y1 += i_source_margin; 770 p_y2 += i_source_margin; 771 p_u += i_source_margin_c; 772 p_v += i_source_margin_c; 773 p_line1 += i_dest_margin; 774 p_line2 += i_dest_margin; 775 } 776 } 777 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2) 567 778 } 568 779 … … 602 813 - p_dest->p->i_visible_pitch; 603 814 815 #if !defined(MODULE_NAME_IS_i420_yuy2_sse2) 604 
816 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) 605 817 { … … 612 824 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; ) 613 825 { 614 #if defined (MODULE_NAME_IS_i420_yuy2) 826 #if !defined (MODULE_NAME_IS_i420_yuy2_mmx) 615 827 C_YUV420_UYVY( ); 616 828 C_YUV420_UYVY( ); … … 629 841 p_line2 += i_dest_margin; 630 842 } 843 844 #if defined (MODULE_NAME_IS_i420_yuy2_mmx) 845 __asm__ __volatile__("emms" :: ); 846 #endif 847 848 #else // defined(MODULE_NAME_IS_i420_yuy2_sse2) 849 /* 850 ** SSE2 128-bit fetch/store instructions are faster 851 ** if memory access is 16 bytes aligned 852 */ 853 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch| 854 ((int)p_line2|(int)p_y2))) ) 855 { 856 /* use faster SSE2 aligned fetch and store */ 857 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) 858 { 859 p_line1 = p_line2; 860 p_line2 += p_dest->p->i_pitch; 861 862 p_y1 = p_y2; 863 p_y2 += p_source->p[Y_PLANE].i_pitch; 864 865 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; ) 866 { 867 SSE2_CALL( SSE2_YUV420_UYVY_ALIGNED ); 868 } 869 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; ) 870 { 871 C_YUV420_UYVY( ); 872 } 873 874 p_y1 += i_source_margin; 875 p_y2 += i_source_margin; 876 p_u += i_source_margin_c; 877 p_v += i_source_margin_c; 878 p_line1 += i_dest_margin; 879 p_line2 += i_dest_margin; 880 } 881 } 882 else 883 { 884 /* use slower SSE2 unaligned fetch and store */ 885 for( i_y = p_vout->render.i_height / 2 ; i_y-- ; ) 886 { 887 p_line1 = p_line2; 888 p_line2 += p_dest->p->i_pitch; 889 890 p_y1 = p_y2; 891 p_y2 += p_source->p[Y_PLANE].i_pitch; 892 893 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; ) 894 { 895 SSE2_CALL( SSE2_YUV420_UYVY_UNALIGNED ); 896 } 897 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; ) 898 { 899 C_YUV420_UYVY( ); 900 } 901 902 p_y1 += i_source_margin; 903 p_y2 += i_source_margin; 904 p_u += i_source_margin_c; 905 p_v += i_source_margin_c; 906 p_line1 += i_dest_margin; 907 p_line2 += i_dest_margin; 908 } 909 } 
910 #endif // defined(MODULE_NAME_IS_i420_yuy2_sse2) 631 911 } 632 912 #endif // !defined (MODULE_NAME_IS_i420_yuy2_altivec) … … 676 956 } 677 957 #endif 678 modules/video_chroma/i420_yuy2.h
ra1f2a86 rbae04ee 6 6 * 7 7 * Authors: Samuel Hocevar <sam@zoy.org> 8 * Damien Fouilleul <damien@videolan.org> 8 9 * 9 10 * This program is free software; you can redistribute it and/or modify … … 33 34 "r" (p_u), "r" (p_v) ); \ 34 35 p_line1 += 16; p_line2 += 16; p_y1 += 8; p_y2 += 8; p_u += 4; p_v += 4; \ 35 } while(0) ; \ 36 } while(0) 36 37 37 38 #define MMX_YUV420_YUYV " \n\ … … 112 113 " 113 114 114 #else 115 #elif defined( MODULE_NAME_IS_i420_yuy2_sse2 ) 116 117 /* SSE2 */ 118 119 #define SSE2_CALL(SSE2_INSTRUCTIONS) \ 120 do { \ 121 __asm__ __volatile__( \ 122 ".p2align 3 \n\t" \ 123 SSE2_INSTRUCTIONS \ 124 : \ 125 : "r" (p_line1), "r" (p_line2), "r" (p_y1), "r" (p_y2), \ 126 "r" (p_u), "r" (p_v) ); \ 127 p_line1 += 32; p_line2 += 32; p_y1 += 16; p_y2 += 16; \ 128 p_u += 8; p_v += 8; \ 129 } while(0) 130 131 #define SSE2_YUV420_YUYV_ALIGNED " \n\ 132 movdqa (%2), %%xmm0 # Load 16 Y y15 y14 y13 .. y2 y1 y0 \n\ 133 movq (%4), %%xmm1 # Load 8 Cb u7 u6 u5 u4 u3 u2 u1 u0 \n\ 134 movq (%5), %%xmm2 # Load 8 Cr v7 v6 v5 v4 v3 v2 v1 v0 \n\ 135 punpcklbw %%xmm2, %%xmm1 # v7 u7 v6 u6 .. u1 v0 u0 \n\ 136 movdqa %%xmm0, %%xmm2 # y15 y14 y13 .. y2 y1 y0 \n\ 137 punpcklbw %%xmm1, %%xmm2 # v3 y7 u3 .. 
v0 y1 u0 y0 \n\ 138 movdqa %%xmm2, (%0) # Store low YUYV \n\ 139 punpckhbw %%xmm1, %%xmm0 # v3 y7 u3 y6 v2 y5 u2 y4 \n\ 140 movdqa %%xmm0, 16(%0) # Store high YUYV \n\ 141 movdqa (%3), %%xmm0 # Load 8 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ 142 movdqa %%xmm0, %%xmm2 # Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ 143 punpcklbw %%xmm1, %%xmm2 # v1 Y3 u1 Y2 v0 Y1 u0 Y0 \n\ 144 movdqa %%xmm2, (%1) # Store low YUYV \n\ 145 punpckhbw %%xmm1, %%xmm0 # v3 Y7 u3 Y6 v2 Y5 u2 Y4 \n\ 146 movdqa %%xmm0, 16(%1) # Store high YUYV \n\ 147 " 148 149 #define SSE2_YUV420_YUYV_UNALIGNED " \n\ 150 movdqu (%2), %%xmm0 # Load 16 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\ 151 movq (%4), %%xmm1 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\ 152 movq (%5), %%xmm2 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\ 153 punpcklbw %%xmm2, %%xmm1 # v3 u3 v2 u2 v1 u1 v0 u0 \n\ 154 movdqa %%xmm0, %%xmm2 # y7 y6 y5 y4 y3 y2 y1 y0 \n\ 155 punpcklbw %%xmm1, %%xmm2 # v1 y3 u1 y2 v0 y1 u0 y0 \n\ 156 movdqu %%xmm2, (%0) # Store low YUYV \n\ 157 punpckhbw %%xmm1, %%xmm0 # v3 y7 u3 y6 v2 y5 u2 y4 \n\ 158 movdqu %%xmm0, 16(%0) # Store high YUYV \n\ 159 movdqu (%3), %%xmm0 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ 160 movdqa %%xmm0, %%xmm2 # Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ 161 punpcklbw %%xmm1, %%xmm2 # v1 Y3 u1 Y2 v0 Y1 u0 Y0 \n\ 162 movdqu %%xmm2, (%1) # Store low YUYV \n\ 163 punpckhbw %%xmm1, %%xmm0 # v3 Y7 u3 Y6 v2 Y5 u2 Y4 \n\ 164 movdqu %%xmm0, 16(%1) # Store high YUYV \n\ 165 " 166 167 #define SSE2_YUV420_YVYU_ALIGNED " \n\ 168 movdqa (%2), %%xmm0 # Load 16 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\ 169 movq (%4), %%xmm2 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\ 170 movq (%5), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\ 171 punpcklbw %%xmm2, %%xmm1 # u3 v3 u2 v2 u1 v1 u0 v0 \n\ 172 movdqa %%xmm0, %%xmm2 # y7 y6 y5 y4 y3 y2 y1 y0 \n\ 173 punpcklbw %%xmm1, %%xmm2 # u1 y3 v1 y2 u0 y1 v0 y0 \n\ 174 movdqa %%xmm2, (%0) # Store low YUYV \n\ 175 punpckhbw %%xmm1, %%xmm0 # u3 y7 v3 y6 u2 y5 v2 y4 \n\ 176 movdqa %%xmm0, 16(%0) # Store high YUYV \n\ 177 movdqa (%3), %%xmm0 # Load 16 
Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ 178 movdqa %%xmm0, %%xmm2 # Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ 179 punpcklbw %%xmm1, %%xmm2 # u1 Y3 v1 Y2 u0 Y1 v0 Y0 \n\ 180 movdqa %%xmm2, (%1) # Store low YUYV \n\ 181 punpckhbw %%xmm1, %%xmm0 # u3 Y7 v3 Y6 u2 Y5 v2 Y4 \n\ 182 movdqa %%xmm0, 16(%1) # Store high YUYV \n\ 183 " 184 185 #define SSE2_YUV420_YVYU_UNALIGNED " \n\ 186 movdqu (%2), %%xmm0 # Load 16 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\ 187 movq (%4), %%xmm2 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\ 188 movq (%5), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\ 189 punpcklbw %%xmm2, %%xmm1 # u3 v3 u2 v2 u1 v1 u0 v0 \n\ 190 movdqu %%xmm0, %%xmm2 # y7 y6 y5 y4 y3 y2 y1 y0 \n\ 191 punpcklbw %%xmm1, %%xmm2 # u1 y3 v1 y2 u0 y1 v0 y0 \n\ 192 movdqu %%xmm2, (%0) # Store low YUYV \n\ 193 punpckhbw %%xmm1, %%xmm0 # u3 y7 v3 y6 u2 y5 v2 y4 \n\ 194 movdqu %%xmm0, 16(%0) # Store high YUYV \n\ 195 movdqu (%3), %%xmm0 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ 196 movdqu %%xmm0, %%xmm2 # Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ 197 punpcklbw %%xmm1, %%xmm2 # u1 Y3 v1 Y2 u0 Y1 v0 Y0 \n\ 198 movdqu %%xmm2, (%1) # Store low YUYV \n\ 199 punpckhbw %%xmm1, %%xmm0 # u3 Y7 v3 Y6 u2 Y5 v2 Y4 \n\ 200 movdqu %%xmm0, 16(%1) # Store high YUYV \n\ 201 " 202 203 #define SSE2_YUV420_UYVY_ALIGNED " \n\ 204 movdqa (%2), %%xmm0 # Load 16 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\ 205 movdqa (%3), %%xmm3 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ 206 movq (%4), %%xmm1 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\ 207 movq (%5), %%xmm2 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\ 208 punpcklbw %%xmm2, %%xmm1 # v3 u3 v2 u2 v1 u1 v0 u0 \n\ 209 movdqa %%xmm1, %%xmm2 # v3 u3 v2 u2 v1 u1 v0 u0 \n\ 210 punpcklbw %%xmm0, %%xmm2 # y3 v1 y2 u1 y1 v0 y0 u0 \n\ 211 movdqa %%xmm2, (%0) # Store low UYVY \n\ 212 movdqa %%xmm1, %%xmm2 # u3 v3 u2 v2 u1 v1 u0 v0 \n\ 213 punpckhbw %%xmm0, %%xmm2 # y3 v1 y2 u1 y1 v0 y0 u0 \n\ 214 movdqa %%xmm2, 16(%0) # Store high UYVY \n\ 215 movdqa %%xmm1, %%xmm2 # u3 v3 u2 v2 u1 v1 u0 v0 \n\ 216 punpcklbw %%xmm3, %%xmm2 # Y3 v1 Y2 u1 Y1 v0 Y0 u0 \n\ 
217 movdqa %%xmm2, (%1) # Store low UYVY \n\ 218 punpckhbw %%xmm3, %%xmm1 # Y7 v3 Y6 u3 Y5 v2 Y4 u2 \n\ 219 movdqa %%xmm1, 16(%1) # Store high UYVY \n\ 220 " 221 222 #define SSE2_YUV420_UYVY_UNALIGNED " \n\ 223 movdqu (%2), %%xmm0 # Load 16 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\ 224 movdqu (%3), %%xmm3 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ 225 movq (%4), %%xmm1 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\ 226 movq (%5), %%xmm2 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\ 227 punpcklbw %%xmm2, %%xmm1 # v3 u3 v2 u2 v1 u1 v0 u0 \n\ 228 movdqu %%xmm1, %%xmm2 # v3 u3 v2 u2 v1 u1 v0 u0 \n\ 229 punpcklbw %%xmm0, %%xmm2 # y3 v1 y2 u1 y1 v0 y0 u0 \n\ 230 movdqu %%xmm2, (%0) # Store low UYVY \n\ 231 movdqu %%xmm1, %%xmm2 # u3 v3 u2 v2 u1 v1 u0 v0 \n\ 232 punpckhbw %%xmm0, %%xmm2 # y3 v1 y2 u1 y1 v0 y0 u0 \n\ 233 movdqu %%xmm2, 16(%0) # Store high UYVY \n\ 234 movdqu %%xmm1, %%xmm2 # u3 v3 u2 v2 u1 v1 u0 v0 \n\ 235 punpcklbw %%xmm3, %%xmm2 # Y3 v1 Y2 u1 Y1 v0 Y0 u0 \n\ 236 movdqu %%xmm2, (%1) # Store low UYVY \n\ 237 punpckhbw %%xmm3, %%xmm1 # Y7 v3 Y6 u3 Y5 v2 Y4 u2 \n\ 238 movdqu %%xmm1, 16(%1) # Store high UYVY \n\ 239 " 240 241 #endif 242 243 /* Used in both accelerated and C modules */ 115 244 116 245 #define C_YUV420_YVYU( ) \ … … 128 257 *(p_line1)++ = *(p_line2)++ = *(p_v) - 0x80; p_v += 2; \ 129 258 130 #endif 131 132 /* Used in both MMX and C modules */ 259 133 260 #define C_YUV420_YUYV( ) \ 134 261 *(p_line1)++ = *(p_y1)++; *(p_line2)++ = *(p_y2)++; \
