Changeset f4f90e674b23ba5a949d0bffd942451685d31907
- Timestamp: 17/06/07 00:13:47 (1 year ago)
- Author: Damien Fouilleul <damienf@videolan.org>
- git-committer: Damien Fouilleul <damienf@videolan.org> 1182032027 +0000
- git-parent: [a3eb2a7047a551239dfe1b6dd9cd59dee6718313]
- git-author: Damien Fouilleul <damienf@videolan.org> 1182032027 +0000
- Message: video_chroma: a few SSE2 fixes
-
Files:
-
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
| r9acaa4b |
rf4f90e6 |
|
| 449 | 449 | p_buffer = b_hscale ? p_buffer_start : p_pic; |
|---|
| 450 | 450 | } |
|---|
| 451 | | /* make sure all SSE2 stores are visible thereafter */ |
|---|
| 452 | | #if defined (CAN_COMPILE_SSE2) |
|---|
| 453 | | __asm__ __volatile__ ( "sfence" ); |
|---|
| 454 | | #else |
|---|
| 455 | | _mm_sfence(); |
|---|
| 456 | | #endif |
|---|
| 457 | 451 | } |
|---|
| 458 | 452 | else |
|---|
| … | … | |
| 527 | 521 | } |
|---|
| 528 | 522 | } |
|---|
| | 523 | |
|---|
| | 524 | /* make sure all SSE2 stores are visible thereafter */ |
|---|
| | 525 | #if defined (CAN_COMPILE_SSE2) |
|---|
| | 526 | __asm__ __volatile__ ( "sfence" ::: "memory" ); |
|---|
| | 527 | #else |
|---|
| | 528 | _mm_sfence(); |
|---|
| | 529 | #endif |
|---|
| | 530 | |
|---|
| 529 | 531 | #else // defined (MODULE_NAME_IS_i420_rgb_mmx) |
|---|
| 530 | 532 | |
|---|
| … | … | |
| 756 | 758 | p_buffer = b_hscale ? p_buffer_start : p_pic; |
|---|
| 757 | 759 | } |
|---|
| 758 | | /* make sure all SSE2 stores are visible thereafter */ |
|---|
| 759 | | #if defined (CAN_COMPILE_SSE2) |
|---|
| 760 | | __asm__ __volatile__ ( "sfence" ); |
|---|
| 761 | | #else |
|---|
| 762 | | _mm_sfence(); |
|---|
| 763 | | #endif |
|---|
| 764 | 760 | } |
|---|
| 765 | 761 | else |
|---|
| … | … | |
| 834 | 830 | } |
|---|
| 835 | 831 | } |
|---|
| | 832 | |
|---|
| | 833 | /* make sure all SSE2 stores are visible thereafter */ |
|---|
| | 834 | #if defined (CAN_COMPILE_SSE2) |
|---|
| | 835 | __asm__ __volatile__ ( "sfence" ::: "memory" ); |
|---|
| | 836 | #else |
|---|
| | 837 | _mm_sfence(); |
|---|
| | 838 | #endif |
|---|
| | 839 | |
|---|
| 836 | 840 | #else // defined (MODULE_NAME_IS_i420_rgb_mmx) |
|---|
| 837 | 841 | |
|---|
| … | … | |
| 1180 | 1184 | p_buffer = b_hscale ? p_buffer_start : p_pic; |
|---|
| 1181 | 1185 | } |
|---|
| 1182 | | /* make sure all SSE2 stores are visible thereafter */ |
|---|
| 1183 | | #if defined (CAN_COMPILE_SSE2) |
|---|
| 1184 | | __asm__ __volatile__ ( "sfence" ); |
|---|
| 1185 | | #else |
|---|
| 1186 | | _mm_sfence(); |
|---|
| 1187 | | #endif |
|---|
| 1188 | 1186 | } |
|---|
| 1189 | 1187 | else |
|---|
| … | … | |
| 1264 | 1262 | } |
|---|
| 1265 | 1263 | |
|---|
| 1266 | | #else |
|---|
| | 1264 | /* make sure all SSE2 stores are visible thereafter */ |
|---|
| | 1265 | #if defined (CAN_COMPILE_SSE2) |
|---|
| | 1266 | __asm__ __volatile__ ( "sfence" ::: "memory" ); |
|---|
| | 1267 | #else |
|---|
| | 1268 | _mm_sfence(); |
|---|
| | 1269 | #endif |
|---|
| | 1270 | |
|---|
| | 1271 | #else // defined (MODULE_NAME_IS_i420_rgb_mmx) |
|---|
| 1267 | 1272 | |
|---|
| 1268 | 1273 | if( p_vout->render.i_width & 7 ) |
|---|
| … | … | |
| 1501 | 1506 | p_buffer = b_hscale ? p_buffer_start : p_pic; |
|---|
| 1502 | 1507 | } |
|---|
| 1503 | | /* make sure all SSE2 stores are visible thereafter */ |
|---|
| 1504 | | #if defined (CAN_COMPILE_SSE2) |
|---|
| 1505 | | __asm__ __volatile__ ( "sfence" ); |
|---|
| 1506 | | #else |
|---|
| 1507 | | _mm_sfence(); |
|---|
| 1508 | | #endif |
|---|
| 1509 | 1508 | } |
|---|
| 1510 | 1509 | else |
|---|
| r9acaa4b |
rf4f90e6 |
|
| 62 | 62 | |
|---|
| 63 | 63 | #define SSE2_INIT_16_ALIGNED " \n\ |
|---|
| 64 | | prefetcht1 (%3) # cache preload for image \n\ |
|---|
| 65 | 64 | movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\ |
|---|
| 66 | 65 | movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\ |
|---|
| … | … | |
| 70 | 69 | |
|---|
| 71 | 70 | #define SSE2_INIT_16_UNALIGNED " \n\ |
|---|
| 72 | | prefetcht1 (%3) # cache preload for image \n\ |
|---|
| 73 | 71 | movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\ |
|---|
| 74 | 72 | movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\ |
|---|
| 75 | 73 | pxor %%xmm4, %%xmm4 # zero mm4 \n\ |
|---|
| 76 | 74 | movdqu (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ |
|---|
| | 75 | prefetchnta (%3) # Tell CPU not to cache output RGB data \n\ |
|---|
| 77 | 76 | " |
|---|
| 78 | 77 | |
|---|
| … | … | |
| 92 | 91 | |
|---|
| 93 | 92 | #define SSE2_INTRINSICS_INIT_16_UNALIGNED \ |
|---|
| 94 | | _mm_prefetch(p_buffer, _MM_HINT_T1); \ |
|---|
| 95 | 93 | xmm0 = _mm_loadl_epi64((__m128i *)p_u); \ |
|---|
| 96 | 94 | xmm1 = _mm_loadl_epi64((__m128i *)p_v); \ |
|---|
| 97 | 95 | xmm4 = _mm_setzero_si128(); \ |
|---|
| 98 | 96 | xmm6 = _mm_loadu_si128((__m128i *)p_y); \ |
|---|
| | 97 | _mm_prefetch(p_buffer, _MM_HINT_NTA); \ |
|---|
| 99 | 98 | |
|---|
| 100 | 99 | #define MMX_INIT_16_GRAY " \n\ |
|---|
| … | … | |
| 119 | 118 | |
|---|
| 120 | 119 | #define SSE2_INIT_32_UNALIGNED " \n\ |
|---|
| 121 | | prefetcht1 (%3) # cache preload for image \n\ |
|---|
| 122 | 120 | movq (%1), %%xmm0 # Load 8 Cb 00 00 00 00 u3 u2 u1 u0 \n\ |
|---|
| 123 | 121 | movq (%2), %%xmm1 # Load 8 Cr 00 00 00 00 v3 v2 v1 v0 \n\ |
|---|
| 124 | 122 | pxor %%xmm4, %%xmm4 # zero mm4 \n\ |
|---|
| 125 | 123 | movdqu (%0), %%xmm6 # Load 16 Y Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\ |
|---|
| | 124 | prefetchnta (%3) # Tell CPU not to cache output RGB data \n\ |
|---|
| 126 | 125 | " |
|---|
| 127 | 126 | |
|---|
| … | … | |
| 142 | 141 | |
|---|
| 143 | 142 | #define SSE2_INTRINSICS_INIT_32_UNALIGNED \ |
|---|
| 144 | | _mm_prefetch(p_buffer, _MM_HINT_T1); \ |
|---|
| 145 | 143 | xmm0 = _mm_loadl_epi64((__m128i *)p_u); \ |
|---|
| 146 | 144 | xmm1 = _mm_loadl_epi64((__m128i *)p_v); \ |
|---|
| 147 | 145 | xmm4 = _mm_setzero_si128(); \ |
|---|
| 148 | 146 | xmm6 = _mm_loadu_si128((__m128i *)p_y); \ |
|---|
| | 147 | _mm_prefetch(p_buffer, _MM_HINT_NTA); \ |
|---|
| 149 | 148 | |
|---|
| 150 | 149 | /* |
|---|
| … | … | |
| 261 | 260 | xmm0 = _mm_unpacklo_epi8(xmm0, xmm4); \ |
|---|
| 262 | 261 | xmm1 = _mm_unpacklo_epi8(xmm1, xmm4); \ |
|---|
| 263 | | xmm5 = _mm_set1_epi32(0x80808080UL); \ |
|---|
| | 262 | xmm5 = _mm_set1_epi32(0x00800080UL); \ |
|---|
| 264 | 263 | xmm0 = _mm_subs_epi16(xmm0, xmm5); \ |
|---|
| 265 | 264 | xmm1 = _mm_subs_epi16(xmm1, xmm5); \ |
|---|
| … | … | |
| 1002 | 1001 | xmm3 = _mm_unpacklo_epi16(xmm3, xmm1); \ |
|---|
| 1003 | 1002 | _mm_stream_si128((__m128i*)(p_buffer+8), xmm3); \ |
|---|
| 1004 | | xmm5 = _xmm_unpackhi_pi16(xmm5, xmm4); \ |
|---|
| | 1003 | xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \ |
|---|
| 1005 | 1004 | _mm_stream_si128((__m128i*)(p_buffer+12), xmm5); \ |
|---|
| 1006 | 1005 | |
|---|
| … | … | |
| 1022 | 1021 | xmm3 = _mm_unpacklo_epi16(xmm3, xmm1); \ |
|---|
| 1023 | 1022 | _mm_storeu_si128((__m128i*)(p_buffer+8), xmm3); \ |
|---|
| 1024 | | xmm5 = _xmm_unpackhi_pi16(xmm5, xmm4); \ |
|---|
| | 1023 | xmm5 = _mm_unpackhi_epi16(xmm5, xmm4); \ |
|---|
| 1025 | 1024 | _mm_storeu_si128((__m128i*)(p_buffer+12), xmm5); \ |
|---|
| 1026 | 1025 | |
|---|