Changeset 5e4dc54cffa6708234ea25496e9704f2895a6f3a
- Timestamp:
- 02/08/07 13:49:49 (1 year ago)
- git-parent:
- Files:
-
- modules/video_chroma/i420_rgb.c (modified) (1 diff)
- modules/video_chroma/i420_rgb.h (modified) (1 diff)
- modules/video_chroma/i420_rgb16.c (modified) (1 diff)
- modules/video_chroma/i420_rgb_mmx.h (modified) (7 diffs)
- modules/video_chroma/i420_yuy2.h (modified) (4 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
modules/video_chroma/i420_rgb.c
ra388370 r5e4dc54 162 162 /* R8G8B8A8 pixel format */ 163 163 msg_Dbg(p_this, "RGB pixel format is R8G8B8A8"); 164 //p_vout->chroma.pf_convert = E_(I420_B8G8R8A8); 165 return -1; 164 p_vout->chroma.pf_convert = E_(I420_R8G8B8A8); 166 165 } 167 166 else if( p_vout->output.i_rmask == 0x0000ff00 modules/video_chroma/i420_rgb.h
ra388370 r5e4dc54 65 65 void E_(I420_R5G6B5) ( vout_thread_t *, picture_t *, picture_t * ); 66 66 void E_(I420_A8R8G8B8) ( vout_thread_t *, picture_t *, picture_t * ); 67 void E_(I420_R8G8B8A8) ( vout_thread_t *, picture_t *, picture_t * ); 67 68 void E_(I420_B8G8R8A8) ( vout_thread_t *, picture_t *, picture_t * ); 68 69 void E_(I420_A8B8G8R8) ( vout_thread_t *, picture_t *, picture_t * ); modules/video_chroma/i420_rgb16.c
ra388370 r5e4dc54 1141 1141 } 1142 1142 1143 void E_(I420_R8G8B8A8)( vout_thread_t *p_vout, picture_t *p_src, 1144 picture_t *p_dest ) 1145 { 1146 /* We got this one from the old arguments */ 1147 uint32_t *p_pic = (uint32_t*)p_dest->p->p_pixels; 1148 uint8_t *p_y = p_src->Y_PIXELS; 1149 uint8_t *p_u = p_src->U_PIXELS; 1150 uint8_t *p_v = p_src->V_PIXELS; 1151 1152 vlc_bool_t b_hscale; /* horizontal scaling type */ 1153 unsigned int i_vscale; /* vertical scaling type */ 1154 unsigned int i_x, i_y; /* horizontal and vertical indexes */ 1155 1156 int i_right_margin; 1157 int i_rewind; 1158 int i_scale_count; /* scale modulo counter */ 1159 int i_chroma_width = p_vout->render.i_width / 2; /* chroma width */ 1160 uint32_t * p_pic_start; /* beginning of the current line for copy */ 1161 /* Conversion buffer pointer */ 1162 uint32_t * p_buffer_start = (uint32_t*)p_vout->chroma.p_sys->p_buffer; 1163 uint32_t * p_buffer; 1164 1165 /* Offset array pointer */ 1166 int * p_offset_start = p_vout->chroma.p_sys->p_offset; 1167 int * p_offset; 1168 1169 const int i_source_margin = p_src->p[0].i_pitch 1170 - p_src->p[0].i_visible_pitch; 1171 const int i_source_margin_c = p_src->p[1].i_pitch 1172 - p_src->p[1].i_visible_pitch; 1173 1174 i_right_margin = p_dest->p->i_pitch - p_dest->p->i_visible_pitch; 1175 1176 /* Rule: when a picture of size (x1,y1) with aspect ratio r1 is rendered 1177 * on a picture of size (x2,y2) with aspect ratio r2, if x1 grows to x1' 1178 * then y1 grows to y1' = x1' * y2/x2 * r2/r1 */ 1179 SetOffset( p_vout->render.i_width, p_vout->render.i_height, 1180 p_vout->output.i_width, p_vout->output.i_height, 1181 &b_hscale, &i_vscale, p_offset_start ); 1182 1183 /* 1184 * Perform conversion 1185 */ 1186 i_scale_count = ( i_vscale == 1 ) ? 1187 p_vout->output.i_height : p_vout->render.i_height; 1188 1189 #if defined (MODULE_NAME_IS_i420_rgb_sse2) 1190 1191 if( p_vout->render.i_width & 15 ) 1192 { 1193 i_rewind = 16 - ( p_vout->render.i_width & 15 ); 1194 } 1195 else 1196 { 1197 i_rewind = 0; 1198 } 1199 1200 /* 1201 ** SSE2 128 bits fetch/store instructions are faster 1202 ** if memory access is 16 bytes aligned 1203 */ 1204 1205 p_buffer = b_hscale ? p_buffer_start : p_pic; 1206 if( 0 == (15 & (p_src->p[Y_PLANE].i_pitch| 1207 p_dest->p->i_pitch| 1208 ((int)p_y)| 1209 ((int)p_buffer))) ) 1210 { 1211 /* use faster SSE2 aligned fetch and store */ 1212 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) 1213 { 1214 p_pic_start = p_pic; 1215 1216 for ( i_x = p_vout->render.i_width / 16; i_x--; ) 1217 { 1218 SSE2_CALL ( 1219 SSE2_INIT_32_ALIGNED 1220 SSE2_YUV_MUL 1221 SSE2_YUV_ADD 1222 SSE2_UNPACK_32_RGBA_ALIGNED 1223 ); 1224 p_y += 16; 1225 p_u += 8; 1226 p_v += 8; 1227 p_buffer += 16; 1228 } 1229 1230 /* Here we do some unaligned reads and duplicate conversions, but 1231 * at least we have all the pixels */ 1232 if( i_rewind ) 1233 { 1234 p_y -= i_rewind; 1235 p_u -= i_rewind >> 1; 1236 p_v -= i_rewind >> 1; 1237 p_buffer -= i_rewind; 1238 SSE2_CALL ( 1239 SSE2_INIT_32_UNALIGNED 1240 SSE2_YUV_MUL 1241 SSE2_YUV_ADD 1242 SSE2_UNPACK_32_RGBA_UNALIGNED 1243 ); 1244 p_y += 16; 1245 p_u += 4; 1246 p_v += 4; 1247 } 1248 SCALE_WIDTH; 1249 SCALE_HEIGHT( 420, 4 ); 1250 1251 p_y += i_source_margin; 1252 if( i_y % 2 ) 1253 { 1254 p_u += i_source_margin_c; 1255 p_v += i_source_margin_c; 1256 } 1257 p_buffer = b_hscale ? p_buffer_start : p_pic; 1258 } 1259 } 1260 else 1261 { 1262 /* use slower SSE2 unaligned fetch and store */ 1263 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) 1264 { 1265 p_pic_start = p_pic; 1266 p_buffer = b_hscale ? p_buffer_start : p_pic; 1267 1268 for ( i_x = p_vout->render.i_width / 16; i_x--; ) 1269 { 1270 SSE2_CALL ( 1271 SSE2_INIT_32_UNALIGNED 1272 SSE2_YUV_MUL 1273 SSE2_YUV_ADD 1274 SSE2_UNPACK_32_RGBA_UNALIGNED 1275 ); 1276 p_y += 16; 1277 p_u += 8; 1278 p_v += 8; 1279 p_buffer += 16; 1280 } 1281 1282 /* Here we do some unaligned reads and duplicate conversions, but 1283 * at least we have all the pixels */ 1284 if( i_rewind ) 1285 { 1286 p_y -= i_rewind; 1287 p_u -= i_rewind >> 1; 1288 p_v -= i_rewind >> 1; 1289 p_buffer -= i_rewind; 1290 SSE2_CALL ( 1291 SSE2_INIT_32_UNALIGNED 1292 SSE2_YUV_MUL 1293 SSE2_YUV_ADD 1294 SSE2_UNPACK_32_RGBA_UNALIGNED 1295 ); 1296 p_y += 16; 1297 p_u += 8; 1298 p_v += 8; 1299 } 1300 SCALE_WIDTH; 1301 SCALE_HEIGHT( 420, 4 ); 1302 1303 p_y += i_source_margin; 1304 if( i_y % 2 ) 1305 { 1306 p_u += i_source_margin_c; 1307 p_v += i_source_margin_c; 1308 } 1309 p_buffer = b_hscale ? p_buffer_start : p_pic; 1310 } 1311 } 1312 1313 /* make sure all SSE2 stores are visible thereafter */ 1314 SSE2_END; 1315 1316 #else // defined (MODULE_NAME_IS_i420_rgb_mmx) 1317 1318 if( p_vout->render.i_width & 7 ) 1319 { 1320 i_rewind = 8 - ( p_vout->render.i_width & 7 ); 1321 } 1322 else 1323 { 1324 i_rewind = 0; 1325 } 1326 1327 for( i_y = 0; i_y < p_vout->render.i_height; i_y++ ) 1328 { 1329 p_pic_start = p_pic; 1330 p_buffer = b_hscale ? p_buffer_start : p_pic; 1331 1332 for ( i_x = p_vout->render.i_width / 8; i_x--; ) 1333 { 1334 MMX_CALL ( 1335 MMX_INIT_32 1336 MMX_YUV_MUL 1337 MMX_YUV_ADD 1338 MMX_UNPACK_32_RGBA 1339 ); 1340 p_y += 8; 1341 p_u += 4; 1342 p_v += 4; 1343 p_buffer += 8; 1344 } 1345 1346 /* Here we do some unaligned reads and duplicate conversions, but 1347 * at least we have all the pixels */ 1348 if( i_rewind ) 1349 { 1350 p_y -= i_rewind; 1351 p_u -= i_rewind >> 1; 1352 p_v -= i_rewind >> 1; 1353 p_buffer -= i_rewind; 1354 MMX_CALL ( 1355 MMX_INIT_32 1356 MMX_YUV_MUL 1357 MMX_YUV_ADD 1358 MMX_UNPACK_32_RGBA 1359 ); 1360 p_y += 8; 1361 p_u += 4; 1362 p_v += 4; 1363 p_buffer += 8; 1364 } 1365 SCALE_WIDTH; 1366 SCALE_HEIGHT( 420, 4 ); 1367 1368 p_y += i_source_margin; 1369 if( i_y % 2 ) 1370 { 1371 p_u += i_source_margin_c; 1372 p_v += i_source_margin_c; 1373 } 1374 } 1375 1376 /* re-enable FPU registers */ 1377 MMX_END; 1378 1379 #endif 1380 } 1381 1143 1382 void E_(I420_B8G8R8A8)( vout_thread_t *p_vout, picture_t *p_src, 1144 1383 picture_t *p_dest ) modules/video_chroma/i420_rgb_mmx.h
ra388370 r5e4dc54 301 301 " 302 302 303 #define MMX_UNPACK_32_RGBA " \n\ 304 pxor %%mm3, %%mm3 # zero mm3 \n\ 305 movq %%mm2, %%mm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\ 306 punpcklbw %%mm1, %%mm4 # R3 G3 R2 G2 R1 G1 R0 G0 \n\ 307 punpcklbw %%mm0, %%mm3 # B3 00 B2 00 B1 00 B0 00 \n\ 308 movq %%mm3, %%mm5 # R3 00 R2 00 R1 00 R0 00 \n\ 309 punpcklwd %%mm4, %%mm3 # R1 G1 B1 00 R0 G0 B0 00 \n\ 310 movq %%mm3, (%3) # Store RGBA1 RGBA0 \n\ 311 punpckhwd %%mm4, %%mm5 # R3 G3 B3 00 R2 G2 B2 00 \n\ 312 movq %%mm5, 8(%3) # Store RGBA3 RGBA2 \n\ 313 pxor %%mm6, %%mm6 # zero mm6 \n\ 314 punpckhbw %%mm1, %%mm2 # R7 G7 R6 G6 R5 G5 R4 G4 \n\ 315 punpckhbw %%mm0, %%mm6 # B7 00 B6 00 B5 00 B4 00 \n\ 316 movq %%mm6, %%mm0 # B7 00 B6 00 B5 00 B4 00 \n\ 317 punpcklwd %%mm2, %%mm6 # R5 G5 B5 00 R4 G4 B4 00 \n\ 318 movq %%mm6, 16(%3) # Store RGBA5 RGBA4 \n\ 319 punpckhwd %%mm2, %%mm0 # R7 G7 B7 00 R6 G6 B6 00 \n\ 320 movq %%mm0, 24(%3) # Store RGBA7 RGBA6 \n\ 321 " 322 303 323 #define MMX_UNPACK_32_BGRA " \n\ 304 324 pxor %%mm3, %%mm3 # zero mm3 \n\ … … 357 377 358 378 #define MMX_INIT_16 \ 359 mm0 = _mm_cvtsi32_si64( (int)*p_u);\360 mm1 = _mm_cvtsi32_si64( (int)*p_v);\379 mm0 = _mm_cvtsi32_si64(*(int*)p_u); \ 380 mm1 = _mm_cvtsi32_si64(*(int*)p_v); \ 361 381 mm4 = _mm_setzero_si64(); \ 362 mm6 = (__m64)*(uint64_t *)p_y 382 mm6 = (__m64)*(uint64_t *)p_y; 363 383 364 384 #define MMX_INIT_32 \ 365 mm0 = _mm_cvtsi32_si64( (int)*p_u);\385 mm0 = _mm_cvtsi32_si64(*(int*)p_u); \ 366 386 *(uint16_t *)p_buffer = 0; \ 367 mm1 = _mm_cvtsi32_si64( (int)*p_v);\387 mm1 = _mm_cvtsi32_si64(*(int*)p_v); \ 368 388 mm4 = _mm_setzero_si64(); \ 369 389 mm6 = (__m64)*(uint64_t *)p_y; … … 484 504 *(uint64_t *)(p_buffer + 6) = (uint64_t)mm0; 485 505 506 #define MMX_UNPACK_32_RGBA \ 507 mm3 = _mm_setzero_si64(); \ 508 mm4 = mm2; \ 509 mm4 = _mm_unpacklo_pi8(mm4, mm1); \ 510 mm3 = _mm_unpacklo_pi8(mm3, mm0); \ 511 mm5 = mm3; \ 512 mm3 = _mm_unpacklo_pi16(mm3, mm4); \ 513 *(uint64_t *)p_buffer = (uint64_t)mm3; \ 514 mm5 = _mm_unpackhi_pi16(mm5, mm4); \ 515 *(uint64_t *)(p_buffer + 2) = (uint64_t)mm5;\ 516 mm6 = _mm_setzero_si64(); \ 517 mm2 = _mm_unpackhi_pi8(mm2, mm1); \ 518 mm6 = _mm_unpackhi_pi8(mm6, mm0); \ 519 mm0 = mm6; \ 520 mm6 = _mm_unpacklo_pi16(mm6, mm2); \ 521 *(uint64_t *)(p_buffer + 4) = (uint64_t)mm6;\ 522 mm0 = _mm_unpackhi_pi16(mm0, mm2); \ 523 *(uint64_t *)(p_buffer + 6) = (uint64_t)mm0; 524 486 525 #define MMX_UNPACK_32_BGRA \ 487 526 mm3 = _mm_setzero_si64(); \ … … 504 543 505 544 #define MMX_UNPACK_32_ABGR \ 506 ; 545 mm3 = _mm_setzero_si64(); \ 546 mm4 = mm1; \ 547 mm4 = _mm_unpacklo_pi8(mm4, mm2); \ 548 mm5 = mm0; \ 549 mm5 = _mm_unpacklo_pi8(mm5, mm3); \ 550 mm6 = mm4; \ 551 mm4 = _mm_unpacklo_pi16(mm4, mm5); \ 552 *(uint64_t *)p_buffer = (uint64_t)mm4; \ 553 mm6 = _mm_unpackhi_pi16(mm6, mm5); \ 554 *(uint64_t *)(p_buffer + 2) = (uint64_t)mm6;\ 555 mm1 = _mm_unpackhi_pi8(mm1, mm2); \ 556 mm0 = _mm_unpackhi_pi8(mm0, mm3); \ 557 mm2 = mm1; \ 558 mm1 = _mm_unpacklo_pi16(mm1, mm0); \ 559 *(uint64_t *)(p_buffer + 4) = (uint64_t)mm1;\ 560 mm2 = _mm_unpackhi_pi16(mm2, mm0); \ 561 *(uint64_t *)(p_buffer + 6) = (uint64_t)mm2; 507 562 508 563 #endif … … 796 851 " 797 852 853 #define SSE2_UNPACK_32_RGBA_ALIGNED " \n\ 854 pxor %%xmm3, %%xmm3 # zero mm3 \n\ 855 movdqa %%xmm2, %%xmm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\ 856 punpcklbw %%xmm1, %%xmm4 # R3 G3 R2 G2 R1 G1 R0 G0 \n\ 857 punpcklbw %%xmm0, %%xmm3 # B3 00 B2 00 B1 00 B0 00 \n\ 858 movdqa %%xmm3, %%xmm5 # R3 00 R2 00 R1 00 R0 00 \n\ 859 punpcklwd %%xmm4, %%xmm3 # R1 G1 B1 00 R0 B0 G0 00 \n\ 860 movntdq %%xmm3, (%3) # Store RGBA3 RGBA2 RGBA1 RGBA0 \n\ 861 punpckhwd %%xmm4, %%xmm5 # R3 G3 B3 00 R2 G2 B2 00 \n\ 862 movntdq %%xmm5, 16(%3) # Store RGBA7 RGBA6 RGBA5 RGBA4 \n\ 863 pxor %%xmm6, %%xmm6 # zero mm6 \n\ 864 punpckhbw %%xmm1, %%xmm2 # R7 G7 R6 G6 R5 G5 R4 G4 \n\ 865 punpckhbw %%xmm0, %%xmm6 # B7 00 B6 00 B5 00 B4 00 \n\ 866 movdqa %%xmm6, %%xmm0 # B7 00 B6 00 B5 00 B4 00 \n\ 867 punpcklwd %%xmm2, %%xmm6 # R5 G5 B5 00 R4 G4 B4 00 \n\ 868 movntdq %%xmm6, 32(%3) # Store BGRA11 BGRA10 BGRA9 RGBA8 \n\ 869 punpckhwd %%xmm2, %%xmm0 # R7 G7 B7 00 R6 G6 B6 00 \n\ 870 movntdq %%xmm0, 48(%3) # Store RGBA15 RGBA14 RGBA13 RGBA12 \n\ 871 " 872 873 #define SSE2_UNPACK_32_RGBA_UNALIGNED " \n\ 874 pxor %%xmm3, %%xmm3 # zero mm3 \n\ 875 movdqa %%xmm2, %%xmm4 # G7 G6 G5 G4 G3 G2 G1 G0 \n\ 876 punpcklbw %%xmm1, %%xmm4 # R3 G3 R2 G2 R1 G1 R0 G0 \n\ 877 punpcklbw %%xmm0, %%xmm3 # B3 00 B2 00 B1 00 B0 00 \n\ 878 movdqa %%xmm3, %%xmm5 # R3 00 R2 00 R1 00 R0 00 \n\ 879 punpcklwd %%xmm4, %%xmm3 # R1 G1 B1 00 R0 B0 G0 00 \n\ 880 movdqu %%xmm3, (%3) # Store RGBA3 RGBA2 RGBA1 RGBA0 \n\ 881 punpckhwd %%xmm4, %%xmm5 # R3 G3 B3 00 R2 G2 B2 00 \n\ 882 movdqu %%xmm5, 16(%3) # Store RGBA7 RGBA6 RGBA5 RGBA4 \n\ 883 pxor %%xmm6, %%xmm6 # zero mm6 \n\ 884 punpckhbw %%xmm1, %%xmm2 # R7 G7 R6 G6 R5 G5 R4 G4 \n\ 885 punpckhbw %%xmm0, %%xmm6 # B7 00 B6 00 B5 00 B4 00 \n\ 886 movdqa %%xmm6, %%xmm0 # B7 00 B6 00 B5 00 B4 00 \n\ 887 punpcklwd %%xmm2, %%xmm6 # R5 G5 B5 00 R4 G4 B4 00 \n\ 888 movdqu %%xmm6, 32(%3) # Store RGBA11 RGBA10 RGBA9 RGBA8 \n\ 889 punpckhwd %%xmm2, %%xmm0 # R7 G7 B7 00 R6 G6 B6 00 \n\ 890 movdqu %%xmm0, 48(%3) # Store RGBA15 RGBA14 RGBA13 RGBA12 \n\ 891 " 892 798 893 #define SSE2_UNPACK_32_BGRA_ALIGNED " \n\ 799 894 pxor %%xmm3, %%xmm3 # zero mm3 \n\ … … 882 977 #include <emmintrin.h> 883 978 884 #define SSE2_CALL(SSE2_INSTRUCTIONS) \885 do { \886 __m128i xmm0, xmm1, xmm2, xmm3, \887 xmm4, xmm5, xmm6, xmm7; \888 SSE2_INSTRUCTIONS \979 #define SSE2_CALL(SSE2_INSTRUCTIONS) \ 980 do { \ 981 __m128i xmm0, xmm1, xmm2, xmm3, \ 982 xmm4, xmm5, xmm6, xmm7; \ 983 SSE2_INSTRUCTIONS \ 889 984 } while(0) 890 985 … … 972 1067 xmm2 = _mm_unpacklo_epi8(xmm2, xmm5); 973 1068 974 #define SSE2_UNPACK_15_ALIGNED \975 xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \976 xmm0 = _mm_and_si128(xmm0, xmm5); \977 xmm0 = _mm_srli_epi16(xmm0, 3); \978 xmm2 = _mm_and_si128(xmm2, xmm5); \979 xmm1 = _mm_and_si128(xmm1, xmm5); \980 xmm1 = _mm_srli_epi16(xmm1, 1); \981 xmm4 = _mm_setzero_si128(); \982 xmm5 = xmm0; \983 xmm7 = xmm2; \984 \ 985 xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \986 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \987 xmm2 = _mm_slli_epi16(xmm2, 2); \988 xmm0 = _mm_or_si128(xmm0, xmm2); \989 _mm_stream_si128((__m128i*)p_buffer, xmm0); \990 \ 991 xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \992 xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \993 xmm7 = _mm_slli_epi16(xmm7, 2); \994 xmm5 = _mm_or_si128(xmm5, xmm7); \1069 #define SSE2_UNPACK_15_ALIGNED \ 1070 xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \ 1071 xmm0 = _mm_and_si128(xmm0, xmm5); \ 1072 xmm0 = _mm_srli_epi16(xmm0, 3); \ 1073 xmm2 = _mm_and_si128(xmm2, xmm5); \ 1074 xmm1 = _mm_and_si128(xmm1, xmm5); \ 1075 xmm1 = _mm_srli_epi16(xmm1, 1); \ 1076 xmm4 = _mm_setzero_si128(); \ 1077 xmm5 = xmm0; \ 1078 xmm7 = xmm2; \ 1079 \ 1080 xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \ 1081 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ 1082 xmm2 = _mm_slli_epi16(xmm2, 2); \ 1083 xmm0 = _mm_or_si128(xmm0, xmm2); \ 1084 _mm_stream_si128((__m128i*)p_buffer, xmm0); \ 1085 \ 1086 xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \ 1087 xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \ 1088 xmm7 = _mm_slli_epi16(xmm7, 2); \ 1089 xmm5 = _mm_or_si128(xmm5, xmm7); \ 995 1090 _mm_stream_si128((__m128i*)(p_buffer+8), xmm5); 996 1091 997 #define SSE2_UNPACK_15_UNALIGNED \998 xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \999 xmm0 = _mm_and_si128(xmm0, xmm5); \1000 xmm0 = _mm_srli_epi16(xmm0, 3); \1001 xmm2 = _mm_and_si128(xmm2, xmm5); \1002 xmm1 = _mm_and_si128(xmm1, xmm5); \1003 xmm1 = _mm_srli_epi16(xmm1, 1); \1004 xmm4 = _mm_setzero_si128(); \1005 xmm5 = xmm0; \1006 xmm7 = xmm2; \1007 \ 1008 xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \1009 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \1010 xmm2 = _mm_slli_epi16(xmm2, 2); \1011 xmm0 = _mm_or_si128(xmm0, xmm2); \1012 _mm_storeu_si128((__m128i*)p_buffer, xmm0); \1013 \ 1014 xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \1015 xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \1016 xmm7 = _mm_slli_epi16(xmm7, 2); \1017 xmm5 = _mm_or_si128(xmm5, xmm7); \1092 #define SSE2_UNPACK_15_UNALIGNED \ 1093 xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \ 1094 xmm0 = _mm_and_si128(xmm0, xmm5); \ 1095 xmm0 = _mm_srli_epi16(xmm0, 3); \ 1096 xmm2 = _mm_and_si128(xmm2, xmm5); \ 1097 xmm1 = _mm_and_si128(xmm1, xmm5); \ 1098 xmm1 = _mm_srli_epi16(xmm1, 1); \ 1099 xmm4 = _mm_setzero_si128(); \ 1100 xmm5 = xmm0; \ 1101 xmm7 = xmm2; \ 1102 \ 1103 xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \ 1104 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ 1105 xmm2 = _mm_slli_epi16(xmm2, 2); \ 1106 xmm0 = _mm_or_si128(xmm0, xmm2); \ 1107 _mm_storeu_si128((__m128i*)p_buffer, xmm0); \ 1108 \ 1109 xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \ 1110 xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \ 1111 xmm7 = _mm_slli_epi16(xmm7, 2); \ 1112 xmm5 = _mm_or_si128(xmm5, xmm7); \ 1018 1113 _mm_storeu_si128((__m128i*)(p_buffer+16), xmm5); 1019 1114 1020 #define SSE2_UNPACK_16_ALIGNED \1021 xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \1022 xmm0 = _mm_and_si128(xmm0, xmm5); \1023 xmm1 = _mm_and_si128(xmm1, xmm5); \1024 xmm5 = _mm_set1_epi32(0xfcfcfcfcUL); \1025 xmm2 = _mm_and_si128(xmm2, xmm5); \1026 xmm0 = _mm_srli_epi16(xmm0, 3); \1027 xmm4 = _mm_setzero_si128(); \1028 xmm5 = xmm0; \1029 xmm7 = xmm2; \1030 \ 1031 xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \1032 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \1033 xmm2 = _mm_slli_epi16(xmm2, 3); \1034 xmm0 = _mm_or_si128(xmm0, xmm2); \1035 _mm_stream_si128((__m128i*)p_buffer, xmm0); \1036 \ 1037 xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \1038 xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \1039 xmm7 = _mm_slli_epi16(xmm7, 3); \1040 xmm5 = _mm_or_si128(xmm5, xmm7); \1115 #define SSE2_UNPACK_16_ALIGNED \ 1116 xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \ 1117 xmm0 = _mm_and_si128(xmm0, xmm5); \ 1118 xmm1 = _mm_and_si128(xmm1, xmm5); \ 1119 xmm5 = _mm_set1_epi32(0xfcfcfcfcUL); \ 1120 xmm2 = _mm_and_si128(xmm2, xmm5); \ 1121 xmm0 = _mm_srli_epi16(xmm0, 3); \ 1122 xmm4 = _mm_setzero_si128(); \ 1123 xmm5 = xmm0; \ 1124 xmm7 = xmm2; \ 1125 \ 1126 xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \ 1127 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ 1128 xmm2 = _mm_slli_epi16(xmm2, 3); \ 1129 xmm0 = _mm_or_si128(xmm0, xmm2); \ 1130 _mm_stream_si128((__m128i*)p_buffer, xmm0); \ 1131 \ 1132 xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \ 1133 xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \ 1134 xmm7 = _mm_slli_epi16(xmm7, 3); \ 1135 xmm5 = _mm_or_si128(xmm5, xmm7); \ 1041 1136 _mm_stream_si128((__m128i*)(p_buffer+8), xmm5); 1042 1137 1043 #define SSE2_UNPACK_16_UNALIGNED \1044 xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \1045 xmm0 = _mm_and_si128(xmm0, xmm5); \1046 xmm1 = _mm_and_si128(xmm1, xmm5); \1047 xmm5 = _mm_set1_epi32(0xfcfcfcfcUL); \1048 xmm2 = _mm_and_si128(xmm2, xmm5); \1049 xmm0 = _mm_srli_epi16(xmm0, 3); \1050 xmm4 = _mm_setzero_si128(); \1051 xmm5 = xmm0; \1052 xmm7 = xmm2; \1053 \ 1054 xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \1055 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \1056 xmm2 = _mm_slli_epi16(xmm2, 3); \1057 xmm0 = _mm_or_si128(xmm0, xmm2); \1058 _mm_storeu_si128((__m128i*)p_buffer, xmm0); \1059 \ 1060 xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \1061 xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \1062 xmm7 = _mm_slli_epi16(xmm7, 3); \1063 xmm5 = _mm_or_si128(xmm5, xmm7); \1138 #define SSE2_UNPACK_16_UNALIGNED \ 1139 xmm5 = _mm_set1_epi32(0xf8f8f8f8UL); \ 1140 xmm0 = _mm_and_si128(xmm0, xmm5); \ 1141 xmm1 = _mm_and_si128(xmm1, xmm5); \ 1142 xmm5 = _mm_set1_epi32(0xfcfcfcfcUL); \ 1143 xmm2 = _mm_and_si128(xmm2, xmm5); \ 1144 xmm0 = _mm_srli_epi16(xmm0, 3); \ 1145 xmm4 = _mm_setzero_si128(); \ 1146 xmm5 = xmm0; \ 1147 xmm7 = xmm2; \ 1148 \ 1149 xmm2 = _mm_unpacklo_epi8(xmm2, xmm4); \ 1150 xmm0 = _mm_unpacklo_epi8(xmm0, xmm1); \ 1151 xmm2 = _mm_slli_epi16(xmm2, 3); \ 1152 xmm0 = _mm_or_si128(xmm0, xmm2); \ 1153 _mm_storeu_si128((__m128i*)p_buffer, xmm0); \ 1154 \ 1155 xmm7 = _mm_unpackhi_epi8(xmm7, xmm4); \ 1156 xmm5 = _mm_unpackhi_epi8(xmm5, xmm1); \ 1157 xmm7 = _mm_slli_epi16(xmm7, 3); \ 1158 xmm5 = _mm_or_si128(xmm5, xmm7); \ 1064 1159 _mm_storeu_si128((__m128i*)(p_buffer+8), xmm5); 1065 1160 1066 #define SSE2_UNPACK_32_ARGB_ALIGNED \1067 xmm3 = _mm_setzero_si128(); \1068 xmm4 = xmm0; \1069 xmm4 = _mm_unpacklo_epi8(xmm4, xmm2); \1070 xmm5 = xmm1; \1071 xmm5 = _mm_unpacklo_epi8(xmm5, xmm3); \1072 xmm6 = xmm4; \1073 xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \1074 _mm_stream_si128((__m128i*)(p_buffer), xmm4); \1075 xmm6 = _mm_unpackhi_epi16(xmm6, xmm5); \1161 #define SSE2_UNPACK_32_ARGB_ALIGNED \ 1162 xmm3 = _mm_setzero_si128(); \ 1163 xmm4 = xmm0; \ 1164 xmm4 = _mm_unpacklo_epi8(xmm4, xmm2); \ 1165 xmm5 = xmm1; \ 1166 xmm5 = _mm_unpacklo_epi8(xmm5, xmm3); \ 1167 xmm6 = xmm4; \ 1168 xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \ 1169 _mm_stream_si128((__m128i*)(p_buffer), xmm4); \ 1170 xmm6 = _mm_unpackhi_epi16(xmm6, xmm5); \ 1076 1171 _mm_stream_si128((__m128i*)(p_buffer+4), xmm6); \ 1077 xmm0 = _mm_unpackhi_epi8(xmm0, xmm2); \1078 xmm1 = _mm_unpackhi_epi8(xmm1, xmm3); \1079 xmm5 = xmm0; \1080 xmm5 = _mm_unpacklo_epi16(xmm5, xmm1); \1172 xmm0 = _mm_unpackhi_epi8(xmm0, xmm2); \ 1173 xmm1 = _mm_unpackhi_epi8(xmm1, xmm3); \ 1174 xmm5 = xmm0; \ 1175 xmm5 = _mm_unpacklo_epi16(xmm5, xmm1); \ 1081 1176 _mm_stream_si128((__m128i*)(p_buffer+8), xmm5); \ 1082 xmm0 = _mm_unpackhi_epi16(xmm0, xmm1); \1177 xmm0 = _mm_unpackhi_epi16(xmm0, xmm1); \ 1083 1178 _mm_stream_si128((__m128i*)(p_buffer+12), xmm0); 1084 1179 1085 #define SSE2_UNPACK_32_ARGB_UNALIGNED \1086 xmm3 = _mm_setzero_si128(); \1087 xmm4 = xmm0; \1088 xmm4 = _mm_unpacklo_epi8(xmm4, xmm2); \1089 xmm5 = xmm1; \1090 xmm5 = _mm_unpacklo_epi8(xmm5, xmm3); \1091 xmm6 = xmm4; \1092 xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \1093 _mm_storeu_si128((__m128i*)(p_buffer), xmm4); \1094 xmm6 = _mm_unpackhi_epi16(xmm6, xmm5); \1180 #define SSE2_UNPACK_32_ARGB_UNALIGNED \ 1181 xmm3 = _mm_setzero_si128(); \ 1182 xmm4 = xmm0; \ 1183 xmm4 = _mm_unpacklo_epi8(xmm4, xmm2); \ 1184 xmm5 = xmm1; \ 1185 xmm5 = _mm_unpacklo_epi8(xmm5, xmm3); \ 1186 xmm6 = xmm4; \ 1187 xmm4 = _mm_unpacklo_epi16(xmm4, xmm5); \ 1188 _mm_storeu_si128((__m128i*)(p_buffer), xmm4); \ 1189 xmm6 = _mm_unpackhi_epi16(xmm6, xmm5); \ 1095 1190 _mm_storeu_si128((__m128i*)(p_buffer+4), xmm6); \ 1096 xmm0 = _mm_unpackhi_epi8(xmm0, xmm2); \1097 xmm1 = _mm_unpackhi_epi8(xmm1, xmm3); \1098 xmm5 = xmm0; \1099 xmm5 = _mm_unpacklo_epi16(xmm5, xmm1); \1191 xmm0 = _mm_unpackhi_epi8(xmm0, xmm2); \ 1192 xmm1 = _mm_unpackhi_epi8(xmm1, xmm3); \ 1193 xmm5 = xmm0; \ 1194 xmm5 = _mm_unpacklo_epi16(xmm5, xmm1); \ 1100 1195 _mm_storeu_si128((__m128i*)(p_buffer+8), xmm5); \ 1101 xmm0 = _mm_unpackhi_epi16(xmm0, xmm1); \1196 xmm0 = _mm_unpackhi_epi16(xmm0, xmm1); \ 1102 1197 _mm_storeu_si128((__m128i*)(p_buffer+12), xmm0); 1103 1198 1104 #define SSE2_UNPACK_32_ BGRA_ALIGNED\1105 xmm3 = _mm_setzero_si128(); \1106 xmm4 = xmm2; \1107 xmm4 = _mm_unpacklo_epi8(xmm4, xmm 0);\1108 xmm3 = _mm_unpacklo_epi8(xmm3, xmm 1);\1109 xmm5 = xmm3; \1110 xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \1111 _mm_stream_si128((__m128i*)(p_buffer), xmm3); \1112 xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \1199 #define SSE2_UNPACK_32_RGBA_ALIGNED \ 1200 xmm3 = _mm_setzero_si128(); \ 1201 xmm4 = xmm2; \ 1202 xmm4 = _mm_unpacklo_epi8(xmm4, xmm1); \ 1203 xmm3 = _mm_unpacklo_epi8(xmm3, xmm0); \ 1204 xmm5 = xmm3; \ 1205 xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \ 1206 _mm_stream_si128((__m128i*)(p_buffer), xmm3); \ 1207 xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \ 1113 1208 _mm_stream_si128((__m128i*)(p_buffer+4), xmm5); \ 1114 xmm6 = _mm_setzero_si128(); \1115 xmm2 = _mm_unpackhi_epi8(xmm2, xmm 0);\1116 xmm6 = _mm_unpackhi_epi8(xmm6, xmm 1);\1117 xmm0 = xmm6; \1118 xmm6 = _mm_unpacklo_epi16(xmm6, xmm2); \1209 xmm6 = _mm_setzero_si128(); \ 1210 xmm2 = _mm_unpackhi_epi8(xmm2, xmm1); \ 1211 xmm6 = _mm_unpackhi_epi8(xmm6, xmm0); \ 1212 xmm0 = xmm6; \ 1213 xmm6 = _mm_unpacklo_epi16(xmm6, xmm2); \ 1119 1214 _mm_stream_si128((__m128i*)(p_buffer+8), xmm6); \ 1120 xmm0 = _mm_unpackhi_epi16(xmm0, xmm2); \1215 xmm0 = _mm_unpackhi_epi16(xmm0, xmm2); \ 1121 1216 _mm_stream_si128((__m128i*)(p_buffer+12), xmm0); 1122 1217 1123 #define SSE2_UNPACK_32_ BGRA_UNALIGNED\1124 xmm3 = _mm_setzero_si128(); \1125 xmm4 = xmm2; \1126 xmm4 = _mm_unpacklo_epi8(xmm4, xmm 0);\1127 xmm3 = _mm_unpacklo_epi8(xmm3, xmm 1);\1128 xmm5 = xmm3; \1129 xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \1130 _mm_storeu_si128((__m128i*)(p_buffer), xmm3); \1131 xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \1218 #define SSE2_UNPACK_32_RGBA_UNALIGNED \ 1219 xmm3 = _mm_setzero_si128(); \ 1220 xmm4 = xmm2; \ 1221 xmm4 = _mm_unpacklo_epi8(xmm4, xmm1); \ 1222 xmm3 = _mm_unpacklo_epi8(xmm3, xmm0); \ 1223 xmm5 = xmm3; \ 1224 xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \ 1225 _mm_storeu_si128((__m128i*)(p_buffer), xmm3); \ 1226 xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \ 1132 1227 _mm_storeu_si128((__m128i*)(p_buffer+4), xmm5); \ 1133 xmm6 = _mm_setzero_si128(); \1134 xmm2 = _mm_unpackhi_epi8(xmm2, xmm 0);\1135 xmm6 = _mm_unpackhi_epi8(xmm6, xmm 1);\1136 xmm0 = xmm6; \1137 xmm6 = _mm_unpacklo_epi16(xmm6, xmm2); \1228 xmm6 = _mm_setzero_si128(); \ 1229 xmm2 = _mm_unpackhi_epi8(xmm2, xmm1); \ 1230 xmm6 = _mm_unpackhi_epi8(xmm6, xmm0); \ 1231 xmm0 = xmm6; \ 1232 xmm6 = _mm_unpacklo_epi16(xmm6, xmm2); \ 1138 1233 _mm_storeu_si128((__m128i*)(p_buffer+8), xmm6); \ 1139 xmm0 = _mm_unpackhi_epi16(xmm0, xmm2); \1234 xmm0 = _mm_unpackhi_epi16(xmm0, xmm2); \ 1140 1235 _mm_storeu_si128((__m128i*)(p_buffer+12), xmm0); 1141 1236 1142 #define SSE2_UNPACK_32_ABGR_ALIGNED \ 1143 ; 1144 1145 #define SSE2_UNPACK_32_ABGR_UNALIGNED \ 1146 ; 1237 #define SSE2_UNPACK_32_BGRA_ALIGNED \ 1238 xmm3 = _mm_setzero_si128(); \ 1239 xmm4 = xmm2; \ 1240 xmm4 = _mm_unpacklo_epi8(xmm4, xmm0); \ 1241 xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ 1242 xmm5 = xmm3; \ 1243 xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \ 1244 _mm_stream_si128((__m128i*)(p_buffer), xmm3); \ 1245 xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \ 1246 _mm_stream_si128((__m128i*)(p_buffer+4), xmm5); \ 1247 xmm6 = _mm_setzero_si128(); \ 1248 xmm2 = _mm_unpackhi_epi8(xmm2, xmm0); \ 1249 xmm6 = _mm_unpackhi_epi8(xmm6, xmm1); \ 1250 xmm0 = xmm6; \ 1251 xmm6 = _mm_unpacklo_epi16(xmm6, xmm2); \ 1252 _mm_stream_si128((__m128i*)(p_buffer+8), xmm6); \ 1253 xmm0 = _mm_unpackhi_epi16(xmm0, xmm2); \ 1254 _mm_stream_si128((__m128i*)(p_buffer+12), xmm0); 1255 1256 #define SSE2_UNPACK_32_BGRA_UNALIGNED \ 1257 xmm3 = _mm_setzero_si128(); \ 1258 xmm4 = xmm2; \ 1259 xmm4 = _mm_unpacklo_epi8(xmm4, xmm0); \ 1260 xmm3 = _mm_unpacklo_epi8(xmm3, xmm1); \ 1261 xmm5 = xmm3; \ 1262 xmm3 = _mm_unpacklo_epi16(xmm3, xmm4); \ 1263 _mm_storeu_si128((__m128i*)(p_buffer), xmm3); \ 1264 xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \ 1265 _mm_storeu_si128((__m128i*)(p_buffer+4), xmm5); \ 1266 xmm6 = _mm_setzero_si128(); \ 1267 xmm2 = _mm_unpackhi_epi8(xmm2, xmm0); \
