Changeset 262b177b0890b6b1943cb9c4838bcffc5152f290
- Timestamp:
- 08/10/07 20:28:49 (1 year ago)
- git-parent:
- Files:
-
- configure.ac (modified) (3 diffs)
- modules/video_chroma/Modules.am (modified) (1 diff)
- modules/video_chroma/i422_yuy2.c (modified) (10 diffs)
- modules/video_chroma/i422_yuy2.h (modified) (4 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
configure.ac
rf24d0ef r262b177
1258 1258   THREEDNOW_MODULES="memcpy3dn"
1259 1259   SSE_MODULES=""
1260      - SSE2_MODULES="i420_rgb_sse2 i420_yuy2_sse2"
     1260 + SSE2_MODULES="i420_rgb_sse2 i420_yuy2_sse2 i422_yuy2_sse2"
1261 1261   ALTIVEC_MODULES="memcpyaltivec i420_yuy2_altivec"
1262 1262   #ALTIVEC_MODULES="${ALTIVEC_MODULES} idctaltivec motionaltivec"
…    …
1284 1284   if test "${ac_cv_c_mmx_intrinsics}" != "no"; then
1285 1285   AC_DEFINE(HAVE_MMX_INTRINSICS, 1, Define if MMX intrinsics are available.)
1286      - VLC_ADD_CFLAGS([i420_rgb_mmx],[-mmmx])
     1286 + VLC_ADD_CFLAGS([${MMX_MODULES}],[-mmmx])
1287 1287   fi
1288 1288
…    …
1309 1309   if test "${ac_cv_c_sse2_intrinsics}" != "no"; then
1310 1310   AC_DEFINE(HAVE_SSE2_INTRINSICS, 1, Define if SSE2 intrinsics are available.)
1311      - VLC_ADD_CFLAGS([i420_rgb_sse2],[-msse2])
     1311 + VLC_ADD_CFLAGS([${SSE2_MODULES}],[-msse2])
1312 1312   fi
1313 1313
modules/video_chroma/Modules.am
r3173ecb r262b177
51 51   $(NULL)
52 52
   53 + SOURCES_i422_yuy2_sse2 = \
   54 + i422_yuy2.c \
   55 + i422_yuy2.h \
   56 + $(NULL)
   57 +
53 58   SOURCES_i420_ymga = \
54 59   i420_ymga.c \
modules/video_chroma/i422_yuy2.c
ra388370 r262b177 68 68 set_capability( "chroma", 100 ); 69 69 add_requirement( MMX ); 70 #elif defined (MODULE_NAME_IS_i422_yuy2_sse2) 71 set_description( _("SSE2 conversions from " SRC_FOURCC " to " DEST_FOURCC) ); 72 set_capability( "chroma", 120 ); 73 add_requirement( MMX ); 70 74 #endif 71 75 set_callbacks( Activate, NULL ); … … 144 148 picture_t *p_dest ) 145 149 { 146 uint8_t *p_pixels = p_dest->p->p_pixels; 147 int i_pitch = p_dest->p->i_pitch; 150 uint8_t *p_line = p_dest->p->p_pixels; 148 151 uint8_t *p_y = p_source->Y_PIXELS; 149 152 uint8_t *p_u = p_source->U_PIXELS; … … 152 155 int i_x, i_y; 153 156 157 const int i_source_margin = p_source->p[0].i_pitch 158 - p_source->p[0].i_visible_pitch; 159 const int i_source_margin_c = p_source->p[1].i_pitch 160 - p_source->p[1].i_visible_pitch; 161 const int i_dest_margin = p_dest->p->i_pitch 162 - p_dest->p->i_visible_pitch; 163 164 #if defined (MODULE_NAME_IS_i422_yuy2_sse2) 165 166 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch| 167 ((int)p_line|(int)p_y))) ) 168 { 169 /* use faster SSE2 aligned fetch and store */ 170 for( i_y = p_vout->render.i_height ; i_y-- ; ) 171 { 172 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; ) 173 { 174 SSE2_CALL( SSE2_YUV422_YUYV_ALIGNED ); 175 } 176 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; ) 177 { 178 C_YUV422_YUYV( p_line, p_y, p_u, p_v ); 179 } 180 p_y += i_source_margin; 181 p_u += i_source_margin_c; 182 p_v += i_source_margin_c; 183 p_line += i_dest_margin; 184 } 185 } 186 else { 187 /* use slower SSE2 unaligned fetch and store */ 188 for( i_y = p_vout->render.i_height ; i_y-- ; ) 189 { 190 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; ) 191 { 192 SSE2_CALL( SSE2_YUV422_YUYV_UNALIGNED ); 193 } 194 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; ) 195 { 196 C_YUV422_YUYV( p_line, p_y, p_u, p_v ); 197 } 198 p_y += i_source_margin; 199 p_u += i_source_margin_c; 200 p_v += i_source_margin_c; 201 p_line += i_dest_margin; 202 } 203 } 204 
SSE2_END; 205 206 #else 207 154 208 for( i_y = p_vout->render.i_height ; i_y-- ; ) 155 209 { 156 uint8_t *p_line = p_pixels;157 210 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; ) 158 211 { … … 166 219 #endif 167 220 } 168 p_pixels += i_pitch; 221 for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; ) 222 { 223 C_YUV422_YUYV( p_line, p_y, p_u, p_v ); 224 } 225 p_y += i_source_margin; 226 p_u += i_source_margin_c; 227 p_v += i_source_margin_c; 228 p_line += i_dest_margin; 169 229 } 170 230 #if defined (MODULE_NAME_IS_i422_yuy2_mmx) 171 231 MMX_END; 172 #e lif defined (MODULE_NAME_IS_i422_yuy2_sse2)173 SSE2_END; 232 #endif 233 174 234 #endif 175 235 } … … 181 241 picture_t *p_dest ) 182 242 { 183 uint8_t *p_pixels = p_dest->p->p_pixels; 184 int i_pitch = p_dest->p->i_pitch; 243 uint8_t *p_line = p_dest->p->p_pixels; 185 244 uint8_t *p_y = p_source->Y_PIXELS; 186 245 uint8_t *p_u = p_source->U_PIXELS; … … 189 248 int i_x, i_y; 190 249 250 const int i_source_margin = p_source->p[0].i_pitch 251 - p_source->p[0].i_visible_pitch; 252 const int i_source_margin_c = p_source->p[1].i_pitch 253 - p_source->p[1].i_visible_pitch; 254 const int i_dest_margin = p_dest->p->i_pitch 255 - p_dest->p->i_visible_pitch; 256 257 #if defined (MODULE_NAME_IS_i422_yuy2_sse2) 258 259 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch| 260 ((int)p_line|(int)p_y))) ) 261 { 262 /* use faster SSE2 aligned fetch and store */ 263 for( i_y = p_vout->render.i_height ; i_y-- ; ) 264 { 265 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; ) 266 { 267 SSE2_CALL( SSE2_YUV422_YVYU_ALIGNED ); 268 } 269 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; ) 270 { 271 C_YUV422_YVYU( p_line, p_y, p_u, p_v ); 272 } 273 p_y += i_source_margin; 274 p_u += i_source_margin_c; 275 p_v += i_source_margin_c; 276 p_line += i_dest_margin; 277 } 278 } 279 else { 280 /* use slower SSE2 unaligned fetch and store */ 281 for( i_y = p_vout->render.i_height ; i_y-- ; ) 282 { 283 for( i_x = 
p_vout->render.i_width / 16 ; i_x-- ; ) 284 { 285 SSE2_CALL( SSE2_YUV422_YVYU_UNALIGNED ); 286 } 287 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; ) 288 { 289 C_YUV422_YVYU( p_line, p_y, p_u, p_v ); 290 } 291 p_y += i_source_margin; 292 p_u += i_source_margin_c; 293 p_v += i_source_margin_c; 294 p_line += i_dest_margin; 295 } 296 } 297 SSE2_END; 298 299 #else 300 191 301 for( i_y = p_vout->render.i_height ; i_y-- ; ) 192 302 { 193 uint8_t *p_line = p_pixels;194 303 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; ) 195 304 { … … 203 312 #endif 204 313 } 205 p_pixels += i_pitch; 314 for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; ) 315 { 316 C_YUV422_YVYU( p_line, p_y, p_u, p_v ); 317 } 318 p_y += i_source_margin; 319 p_u += i_source_margin_c; 320 p_v += i_source_margin_c; 321 p_line += i_dest_margin; 206 322 } 207 323 #if defined (MODULE_NAME_IS_i422_yuy2_mmx) 208 324 MMX_END; 209 #e lif defined (MODULE_NAME_IS_i422_yuy2_sse2)210 SSE2_END; 325 #endif 326 211 327 #endif 212 328 } … … 218 334 picture_t *p_dest ) 219 335 { 220 uint8_t *p_pixels = p_dest->p->p_pixels; 221 int i_pitch = p_dest->p->i_pitch; 336 uint8_t *p_line = p_dest->p->p_pixels; 222 337 uint8_t *p_y = p_source->Y_PIXELS; 223 338 uint8_t *p_u = p_source->U_PIXELS; … … 226 341 int i_x, i_y; 227 342 343 const int i_source_margin = p_source->p[0].i_pitch 344 - p_source->p[0].i_visible_pitch; 345 const int i_source_margin_c = p_source->p[1].i_pitch 346 - p_source->p[1].i_visible_pitch; 347 const int i_dest_margin = p_dest->p->i_pitch 348 - p_dest->p->i_visible_pitch; 349 350 #if defined (MODULE_NAME_IS_i422_yuy2_sse2) 351 352 if( 0 == (15 & (p_source->p[Y_PLANE].i_pitch|p_dest->p->i_pitch| 353 ((int)p_line|(int)p_y))) ) 354 { 355 /* use faster SSE2 aligned fetch and store */ 356 for( i_y = p_vout->render.i_height ; i_y-- ; ) 357 { 358 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; ) 359 { 360 SSE2_CALL( SSE2_YUV422_UYVY_ALIGNED ); 361 } 362 for( i_x = ( p_vout->render.i_width % 16 ) / 2; 
i_x-- ; ) 363 { 364 C_YUV422_UYVY( p_line, p_y, p_u, p_v ); 365 } 366 p_y += i_source_margin; 367 p_u += i_source_margin_c; 368 p_v += i_source_margin_c; 369 p_line += i_dest_margin; 370 } 371 } 372 else { 373 /* use slower SSE2 unaligned fetch and store */ 374 for( i_y = p_vout->render.i_height ; i_y-- ; ) 375 { 376 for( i_x = p_vout->render.i_width / 16 ; i_x-- ; ) 377 { 378 SSE2_CALL( SSE2_YUV422_UYVY_UNALIGNED ); 379 } 380 for( i_x = ( p_vout->render.i_width % 16 ) / 2; i_x-- ; ) 381 { 382 C_YUV422_UYVY( p_line, p_y, p_u, p_v ); 383 } 384 p_y += i_source_margin; 385 p_u += i_source_margin_c; 386 p_v += i_source_margin_c; 387 p_line += i_dest_margin; 388 } 389 } 390 SSE2_END; 391 392 #else 393 228 394 for( i_y = p_vout->render.i_height ; i_y-- ; ) 229 395 { 230 uint8_t *p_line = p_pixels;231 396 for( i_x = p_vout->render.i_width / 8 ; i_x-- ; ) 232 397 { … … 240 405 #endif 241 406 } 242 p_pixels += i_pitch; 407 for( i_x = ( p_vout->render.i_width % 8 ) / 2; i_x-- ; ) 408 { 409 C_YUV422_UYVY( p_line, p_y, p_u, p_v ); 410 } 411 p_y += i_source_margin; 412 p_u += i_source_margin_c; 413 p_v += i_source_margin_c; 414 p_line += i_dest_margin; 243 415 } 244 416 #if defined (MODULE_NAME_IS_i422_yuy2_mmx) 245 417 MMX_END; 246 #e lif defined (MODULE_NAME_IS_i422_yuy2_sse2)247 SSE2_END; 418 #endif 419 248 420 #endif 249 421 } modules/video_chroma/i422_yuy2.h
ra388370 r262b177 88 88 #include <mmintrin.h> 89 89 90 #define MMX_CALL(MMX_INSTRUCTIONS) \ 91 do { \ 92 __m64 mm0, mm1, mm2; \ 93 MMX_INSTRUCTIONS \ 94 p_line += 16; p_y += 8; \ 95 p_u += 4; p_v += 4; \ 96 } while(0) 97 90 98 #define MMX_END _mm_empty() 99 100 #define MMX_YUV422_YUYV \ 101 mm0 = (__m64)*(uint64_t*)p_y; \ 102 mm1 = _mm_cvtsi32_si64(*(int*)p_u); \ 103 mm2 = _mm_cvtsi32_si64(*(int*)p_v); \ 104 mm1 = _mm_unpacklo_pi8(mm1, mm2); \ 105 mm2 = mm0; \ 106 mm2 = _mm_unpacklo_pi8(mm2, mm1); \ 107 *(uint64_t*)p_line = (uint64_t)mm2; \ 108 mm0 = _mm_unpackhi_pi8(mm0, mm1); \ 109 *(uint64_t*)(p_line+8) = (uint64_t)mm0; 110 111 #define MMX_YUV422_YVYU \ 112 mm0 = (__m64)*(uint64_t*)p_y; \ 113 mm2 = _mm_cvtsi32_si64(*(int*)p_u); \ 114 mm1 = _mm_cvtsi32_si64(*(int*)p_v); \ 115 mm1 = _mm_unpacklo_pi8(mm1, mm2); \ 116 mm2 = mm0; \ 117 mm2 = _mm_unpacklo_pi8(mm2, mm1); \ 118 *(uint64_t*)p_line = (uint64_t)mm2; \ 119 mm0 = _mm_unpackhi_pi8(mm0, mm1); \ 120 *(uint64_t*)(p_line+8) = (uint64_t)mm0; 121 122 #define MMX_YUV422_UYVY \ 123 mm0 = (__m64)*(uint64_t*)p_y; \ 124 mm1 = _mm_cvtsi32_si64(*(int*)p_u); \ 125 mm2 = _mm_cvtsi32_si64(*(int*)p_v); \ 126 mm1 = _mm_unpacklo_pi8(mm1, mm2); \ 127 mm2 = mm1; \ 128 mm2 = _mm_unpacklo_pi8(mm2, mm0); \ 129 *(uint64_t*)p_line = (uint64_t)mm2; \ 130 mm1 = _mm_unpackhi_pi8(mm1, mm0); \ 131 *(uint64_t*)(p_line+8) = (uint64_t)mm1; 91 132 92 133 #endif … … 98 139 /* SSE2 assembly */ 99 140 141 #define SSE2_CALL(MMX_INSTRUCTIONS) \ 142 do { \ 143 __asm__ __volatile__( \ 144 ".p2align 3 \n\t" \ 145 MMX_INSTRUCTIONS \ 146 : \ 147 : "r" (p_line), "r" (p_y), \ 148 "r" (p_u), "r" (p_v) ); \ 149 p_line += 32; p_y += 16; \ 150 p_u += 8; p_v += 8; \ 151 } while(0) 152 100 153 #define SSE2_END __asm__ __volatile__ ( "sfence" ::: "memory" ) 101 154 155 #define SSE2_YUV422_YUYV_ALIGNED " \n\ 156 movdqa (%1), %%xmm0 # Load 8 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\ 157 movq (%2), %%xmm1 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0 \n\ 158 movq (%3), %%xmm2 # Load 4 
Cr 00 00 00 00 v3 v2 v1 v0 \n\ 159 punpcklbw %%xmm2, %%xmm1 # v3 u3 v2 u2 v1 u1 v0 u0 \n\ 160 movdqa %%xmm0, %%xmm2 # y7 y6 y5 y4 y3 y2 y1 y0 \n\ 161 punpcklbw %%xmm1, %%xmm2 # v1 y3 u1 y2 v0 y1 u0 y0 \n\ 162 movntdq %%xmm2, (%0) # Store low YUYV \n\ 163 punpckhbw %%xmm1, %%xmm0 # v3 y7 u3 y6 v2 y5 u2 y4 \n\ 164 movntdq %%xmm0, 16(%0) # Store high YUYV \n\ 165 " 166 167 #define SSE2_YUV422_YUYV_UNALIGNED " \n\ 168 movdqu (%1), %%xmm0 # Load 8 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\ 169 movq (%2), %%xmm1 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0 \n\ 170 movq (%3), %%xmm2 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0 \n\ 171 prefetchnta (%0) # Tell CPU not to cache output YUYV data \n\ 172 punpcklbw %%xmm2, %%xmm1 # v3 u3 v2 u2 v1 u1 v0 u0 \n\ 173 movdqa %%xmm0, %%xmm2 # y7 y6 y5 y4 y3 y2 y1 y0 \n\ 174 punpcklbw %%xmm1, %%xmm2 # v1 y3 u1 y2 v0 y1 u0 y0 \n\ 175 movdqu %%xmm2, (%0) # Store low YUYV \n\ 176 punpckhbw %%xmm1, %%xmm0 # v3 y7 u3 y6 v2 y5 u2 y4 \n\ 177 movdqu %%xmm0, 16(%0) # Store high YUYV \n\ 178 " 179 180 #define SSE2_YUV422_YVYU_ALIGNED " \n\ 181 movdqa (%1), %%xmm0 # Load 8 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\ 182 movq (%2), %%xmm2 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0 \n\ 183 movq (%3), %%xmm1 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0 \n\ 184 punpcklbw %%xmm2, %%xmm1 # u3 v3 u2 v2 u1 v1 u0 v0 \n\ 185 movdqa %%xmm0, %%xmm2 # y7 y6 y5 y4 y3 y2 y1 y0 \n\ 186 punpcklbw %%xmm1, %%xmm2 # u1 y3 v1 y2 u0 y1 v0 y0 \n\ 187 movntdq %%xmm2, (%0) # Store low YUYV \n\ 188 punpckhbw %%xmm1, %%xmm0 # u3 y7 v3 y6 u2 y5 v2 y4 \n\ 189 movntdq %%xmm0, 16(%0) # Store high YUYV \n\ 190 " 191 192 #define SSE2_YUV422_YVYU_UNALIGNED " \n\ 193 movdqu (%1), %%xmm0 # Load 8 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\ 194 movq (%2), %%xmm2 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0 \n\ 195 movq (%3), %%xmm1 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0 \n\ 196 prefetchnta (%0) # Tell CPU not to cache output YUYV data \n\ 197 punpcklbw %%xmm2, %%xmm1 # u3 v3 u2 v2 u1 v1 u0 v0 \n\ 198 movdqa %%xmm0, %%xmm2 # y7 y6 y5 y4 y3 y2 y1 y0 \n\ 199 punpcklbw 
%%xmm1, %%xmm2 # u1 y3 v1 y2 u0 y1 v0 y0 \n\ 200 movdqu %%xmm2, (%0) # Store low YUYV \n\ 201 punpckhbw %%xmm1, %%xmm0 # u3 y7 v3 y6 u2 y5 v2 y4 \n\ 202 movdqu %%xmm0, 16(%0) # Store high YUYV \n\ 203 " 204 205 #define SSE2_YUV422_UYVY_ALIGNED " \n\ 206 movdqa (%1), %%xmm0 # Load 8 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\ 207 movq (%2), %%xmm1 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0 \n\ 208 movq (%3), %%xmm2 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0 \n\ 209 punpcklbw %%xmm2, %%xmm1 # v3 u3 v2 u2 v1 u1 v0 u0 \n\ 210 movdqa %%xmm1, %%xmm2 # v3 u3 v2 u2 v1 u1 v0 u0 \n\ 211 punpcklbw %%xmm0, %%xmm2 # y3 v1 y2 u1 y1 v0 y0 u0 \n\ 212 movntdq %%xmm2, (%0) # Store low UYVY \n\ 213 punpckhbw %%xmm0, %%xmm1 # y7 v3 y6 u3 y5 v2 y4 u2 \n\ 214 movntdq %%xmm1, 16(%0) # Store high UYVY \n\ 215 " 216 217 #define SSE2_YUV422_UYVY_UNALIGNED " \n\ 218 movdqu (%1), %%xmm0 # Load 8 Y y7 y6 y5 y4 y3 y2 y1 y0 \n\ 219 movq (%2), %%xmm1 # Load 4 Cb 00 00 00 00 u3 u2 u1 u0 \n\ 220 movq (%3), %%xmm2 # Load 4 Cr 00 00 00 00 v3 v2 v1 v0 \n\ 221 prefetchnta (%0) # Tell CPU not to cache output YUYV data \n\ 222 punpcklbw %%xmm2, %%xmm1 # v3 u3 v2 u2 v1 u1 v0 u0 \n\ 223 movdqa %%xmm1, %%xmm2 # v3 u3 v2 u2 v1 u1 v0 u0 \n\ 224 punpcklbw %%xmm0, %%xmm2 # y3 v1 y2 u1 y1 v0 y0 u0 \n\ 225 movdqu %%xmm2, (%0) # Store low UYVY \n\ 226 punpckhbw %%xmm0, %%xmm1 # y7 v3 y6 u3 y5 v2 y4 u2 \n\ 227 movdqu %%xmm1, 16(%0) # Store high UYVY \n\ 228 " 229 102 230 #elif defined(HAVE_SSE2_INTRINSICS) 103 231 … … 111 239 #endif 112 240 113 #e lif defined (MODULE_NAME_IS_i422_yuy2)241 #endif 114 242 115 243 #define C_YUV422_YUYV( p_line, p_y, p_u, p_v ) \ … … 137 265 *(p_line)++ = *(p_v) - 0x80; p_v += 2; \ 138 266 139 #endif 140 267
