From 7361ef732b432e153496c30da66081d7e530c7f6 Mon Sep 17 00:00:00 2001
From: Peter de Rivaz
Date: Mon, 14 Dec 2015 16:35:29 +0000
Subject: [PATCH] Fix for issue 1114 compile error

In a 32-bit build with --enable-shared there is a lot of register
pressure, and the register src_strideq is reused. The code needs to
use the stack-based version of src_stride, but this doesn't compile
when used in an lea instruction.

This patch also fixes a related segmentation fault caused by the
implementation using src_strideq even though it has been reused.

This patch also fixes the HBD subpel variance tests that fail when
compiled without --disable-optimizations. These failures were caused
by local variables in the assembler routines colliding with the
caller's stack frame.

Change-Id: Ice9d4dafdcbdc6038ad5ee7c1c09a8f06deca362
---
 vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm | 18 +++----
 vpx_dsp/x86/highbd_variance_sse2.c               | 64 ++++++++++++++----------
 2 files changed, 44 insertions(+), 38 deletions(-)

diff --git a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
index 22d52a2..30ee81b 100644
--- a/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
+++ b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
@@ -79,20 +79,13 @@ SECTION .text
 
 %macro INC_SRC_BY_SRC_STRIDE 0
 %if ARCH_X86=1 && CONFIG_PIC=1
-  lea srcq, [srcq + src_stridemp*2]
+  add srcq, src_stridemp
+  add srcq, src_stridemp
 %else
   lea srcq, [srcq + src_strideq*2]
 %endif
 %endmacro
 
-%macro INC_SRC_BY_SRC_2STRIDE 0
-%if ARCH_X86=1 && CONFIG_PIC=1
-  lea srcq, [srcq + src_stridemp*4]
-%else
-  lea srcq, [srcq + src_strideq*4]
-%endif
-%endmacro
-
 %macro SUBPEL_VARIANCE 1-2 0 ; W
 %define bilin_filter_m bilin_filter_m_sse2
 %define filter_idx_shift 5
@@ -984,8 +977,9 @@ SECTION .text
 .x_other_y_other_loop:
   movu m2, [srcq]
   movu m4, [srcq+2]
-  movu m3, [srcq+src_strideq*2]
-  movu m5, [srcq+src_strideq*2+2]
+  INC_SRC_BY_SRC_STRIDE
+  movu m3, [srcq]
+  movu m5, [srcq+2]
   pmullw m2, filter_x_a
   pmullw m4, filter_x_b
   paddw m2, filter_rnd
@@ -1018,7 +1012,7 @@ SECTION .text
   SUM_SSE m0, m2, m4, m3, m6, m7
   mova m0, m5
 
-  INC_SRC_BY_SRC_2STRIDE
+  INC_SRC_BY_SRC_STRIDE
   lea dstq, [dstq + dst_strideq * 4]
 %if %2 == 1 ; avg
   add secq, sec_str
diff --git a/vpx_dsp/x86/highbd_variance_sse2.c b/vpx_dsp/x86/highbd_variance_sse2.c
index b45331c..81ec5db 100644
--- a/vpx_dsp/x86/highbd_variance_sse2.c
+++ b/vpx_dsp/x86/highbd_variance_sse2.c
@@ -243,13 +243,18 @@ unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
 }
 
 #if CONFIG_USE_X86INC
+// The 2 unused parameters are place holders for PIC enabled build.
+// These definitions are for functions defined in
+// highbd_subpel_variance_impl_sse2.asm
 #define DECL(w, opt) \
 int vpx_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
                                                ptrdiff_t src_stride, \
                                                int x_offset, int y_offset, \
                                                const uint16_t *dst, \
                                                ptrdiff_t dst_stride, \
-                                               int height, unsigned int *sse);
+                                               int height, \
+                                               unsigned int *sse, \
+                                               void *unused0, void *unused);
 #define DECLS(opt1, opt2) \
   DECL(8, opt1); \
   DECL(16, opt1)
@@ -274,7 +279,7 @@ uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
   int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
                                                        x_offset, y_offset, \
                                                        dst, dst_stride, h, \
-                                                       &sse); \
+                                                       &sse, NULL, NULL); \
   if (w > wf) { \
     unsigned int sse2; \
     int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
@@ -282,19 +287,20 @@ uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
                                                           x_offset, y_offset, \
                                                           dst + 16, \
                                                           dst_stride, \
-                                                          h, &sse2); \
+                                                          h, &sse2, \
+                                                          NULL, NULL); \
     se += se2; \
     sse += sse2; \
     if (w > wf * 2) { \
       se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
                                                         x_offset, y_offset, \
                                                         dst + 32, dst_stride, \
-                                                        h, &sse2); \
+                                                        h, &sse2, NULL, NULL); \
       se += se2; \
       sse += sse2; \
       se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
           src + 48, src_stride, x_offset, y_offset, \
-          dst + 48, dst_stride, h, &sse2); \
+          dst + 48, dst_stride, h, &sse2, NULL, NULL); \
       se += se2; \
       sse += sse2; \
     } \
@@ -312,7 +318,7 @@ uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
   int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
                                                        x_offset, y_offset, \
                                                        dst, dst_stride, \
-                                                       h, &sse); \
+                                                       h, &sse, NULL, NULL); \
   if (w > wf) { \
     uint32_t sse2; \
     int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
@@ -320,20 +326,21 @@ uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
                                                           x_offset, y_offset, \
                                                           dst + 16, \
                                                           dst_stride, \
-                                                          h, &sse2); \
+                                                          h, &sse2, \
+                                                          NULL, NULL); \
     se += se2; \
     sse += sse2; \
     if (w > wf * 2) { \
       se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
                                                         x_offset, y_offset, \
                                                         dst + 32, dst_stride, \
-                                                        h, &sse2); \
+                                                        h, &sse2, NULL, NULL); \
       se += se2; \
       sse += sse2; \
       se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
                                                         x_offset, y_offset, \
                                                         dst + 48, dst_stride, \
-                                                        h, &sse2); \
+                                                        h, &sse2, NULL, NULL); \
       se += se2; \
       sse += sse2; \
     } \
@@ -359,27 +366,27 @@ uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
     int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
         src + (start_row * src_stride), src_stride, \
         x_offset, y_offset, dst + (start_row * dst_stride), \
-        dst_stride, height, &sse2); \
+        dst_stride, height, &sse2, NULL, NULL); \
     se += se2; \
     long_sse += sse2; \
     if (w > wf) { \
       se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
           src + 16 + (start_row * src_stride), src_stride, \
           x_offset, y_offset, dst + 16 + (start_row * dst_stride), \
-          dst_stride, height, &sse2); \
+          dst_stride, height, &sse2, NULL, NULL); \
       se += se2; \
       long_sse += sse2; \
       if (w > wf * 2) { \
         se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
             src + 32 + (start_row * src_stride), src_stride, \
             x_offset, y_offset, dst + 32 + (start_row * dst_stride), \
-            dst_stride, height, &sse2); \
+            dst_stride, height, &sse2, NULL, NULL); \
        se += se2; \
        long_sse += sse2; \
        se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
            src + 48 + (start_row * src_stride), src_stride, \
            x_offset, y_offset, dst + 48 + (start_row * dst_stride), \
-            dst_stride, height, &sse2); \
+            dst_stride, height, &sse2, NULL, NULL); \
         se += se2; \
         long_sse += sse2; \
       }\
@@ -410,6 +417,7 @@ FNS(sse2, sse);
 #undef FNS
 #undef FN
 
+// The 2 unused parameters are place holders for PIC enabled build.
 #define DECL(w, opt) \
 int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
                                                    ptrdiff_t src_stride, \
@@ -419,7 +427,8 @@ int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
                                                    const uint16_t *sec, \
                                                    ptrdiff_t sec_stride, \
                                                    int height, \
-                                                   unsigned int *sse);
+                                                   unsigned int *sse, \
+                                                   void *unused0, void *unused);
 #define DECLS(opt1) \
   DECL(16, opt1) \
   DECL(8, opt1)
@@ -439,23 +448,23 @@ uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \
   uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
   int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                src, src_stride, x_offset, \
-               y_offset, dst, dst_stride, sec, w, h, &sse); \
+               y_offset, dst, dst_stride, sec, w, h, &sse, NULL, NULL); \
   if (w > wf) { \
     uint32_t sse2; \
     int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                   src + 16, src_stride, x_offset, y_offset, \
-                  dst + 16, dst_stride, sec + 16, w, h, &sse2); \
+                  dst + 16, dst_stride, sec + 16, w, h, &sse2, NULL, NULL); \
     se += se2; \
     sse += sse2; \
     if (w > wf * 2) { \
       se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                 src + 32, src_stride, x_offset, y_offset, \
-                dst + 32, dst_stride, sec + 32, w, h, &sse2); \
+                dst + 32, dst_stride, sec + 32, w, h, &sse2, NULL, NULL); \
       se += se2; \
       sse += sse2; \
       se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                 src + 48, src_stride, x_offset, y_offset, \
-                dst + 48, dst_stride, sec + 48, w, h, &sse2); \
+                dst + 48, dst_stride, sec + 48, w, h, &sse2, NULL, NULL); \
       se += se2; \
       sse += sse2; \
     } \
@@ -475,14 +484,15 @@ uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
   int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                src, src_stride, x_offset, \
                y_offset, dst, dst_stride, \
-               sec, w, h, &sse); \
+               sec, w, h, &sse, NULL, NULL); \
   if (w > wf) { \
     uint32_t sse2; \
     int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                   src + 16, src_stride, \
                   x_offset, y_offset, \
                   dst + 16, dst_stride, \
-                  sec + 16, w, h, &sse2); \
+                  sec + 16, w, h, &sse2, \
+                  NULL, NULL); \
     se += se2; \
     sse += sse2; \
     if (w > wf * 2) { \
@@ -490,14 +500,16 @@ uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
                 src + 32, src_stride, \
                 x_offset, y_offset, \
                 dst + 32, dst_stride, \
-                sec + 32, w, h, &sse2); \
+                sec + 32, w, h, &sse2, \
+                NULL, NULL); \
       se += se2; \
       sse += sse2; \
       se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
                 src + 48, src_stride, \
                 x_offset, y_offset, \
                 dst + 48, dst_stride, \
-                sec + 48, w, h, &sse2); \
+                sec + 48, w, h, &sse2, \
+                NULL, NULL); \
       se += se2; \
       sse += sse2; \
     } \
@@ -525,7 +537,7 @@ uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
     int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
         src + (start_row * src_stride), src_stride, x_offset, \
         y_offset, dst + (start_row * dst_stride), dst_stride, \
-        sec + (start_row * w), w, height, &sse2); \
+        sec + (start_row * w), w, height, &sse2, NULL, NULL); \
     se += se2; \
     long_sse += sse2; \
     if (w > wf) { \
@@ -533,7 +545,7 @@ uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
           src + 16 + (start_row * src_stride), src_stride, \
           x_offset, y_offset, \
           dst + 16 + (start_row * dst_stride), dst_stride, \
-          sec + 16 + (start_row * w), w, height, &sse2); \
+          sec + 16 + (start_row * w), w, height, &sse2, NULL, NULL); \
       se += se2; \
       long_sse += sse2; \
       if (w > wf * 2) { \
@@ -541,14 +553,14 @@ uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
             src + 32 + (start_row * src_stride), src_stride, \
             x_offset, y_offset, \
             dst + 32 + (start_row * dst_stride), dst_stride, \
-            sec + 32 + (start_row * w), w, height, &sse2); \
+            sec + 32 + (start_row * w), w, height, &sse2, NULL, NULL); \
         se += se2; \
         long_sse += sse2; \
         se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
             src + 48 + (start_row * src_stride), src_stride, \
             x_offset, y_offset, \
             dst + 48 + (start_row * dst_stride), dst_stride, \
-            sec + 48 + (start_row * w), w, height, &sse2); \
+            sec + 48 + (start_row * w), w, height, &sse2, NULL, NULL); \
         se += se2; \
         long_sse += sse2; \
       } \
-- 
2.7.0
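
Background note on the C wrappers touched above: the FN()/FNS() macros expand into
per-block-size functions that call the 16-pixel-wide assembly kernel once per
16-wide column and accumulate the column sums of error (se) and squared error
(sse). The sketch below is not libvpx source; it is a minimal plain-C
illustration of that expanded shape using the post-patch argument list.
col16_kernel is a hypothetical stand-in for a routine such as
vpx_highbd_sub_pixel_variance16xh_sse2 (the bilinear sub-pel filtering is
omitted), and the two trailing void * arguments mirror the placeholder
parameters this patch adds for the 32-bit PIC build.

#include <stddef.h>
#include <stdint.h>

/* Hypothetical stand-in for a 16-wide kernel; it measures whole-pel
 * differences only, so x_offset/y_offset are accepted but ignored here. */
static int col16_kernel(const uint16_t *src, ptrdiff_t src_stride,
                        int x_offset, int y_offset,
                        const uint16_t *dst, ptrdiff_t dst_stride,
                        int height, unsigned int *sse,
                        void *unused0, void *unused) {
  int r, c, se = 0;
  int64_t sq = 0;
  (void)x_offset;
  (void)y_offset;
  (void)unused0;  /* placeholder argument, kept to match the asm prototype */
  (void)unused;   /* placeholder argument, kept to match the asm prototype */
  for (r = 0; r < height; ++r) {
    for (c = 0; c < 16; ++c) {
      const int diff = src[r * src_stride + c] - dst[r * dst_stride + c];
      se += diff;
      sq += (int64_t)diff * diff;
    }
  }
  *sse = (unsigned int)sq;
  return se;
}

/* Shape of an expanded 32-wide wrapper: two 16-wide columns are summed, then
 * the variance is formed as sum(d^2) - (sum(d))^2 / N for N = 32 * h pixels. */
uint32_t subpel_variance32xh_sketch(const uint16_t *src, int src_stride,
                                    int x_offset, int y_offset,
                                    const uint16_t *dst, int dst_stride,
                                    int h, uint32_t *sse_ptr) {
  unsigned int sse, sse2;
  int se = col16_kernel(src, src_stride, x_offset, y_offset,
                        dst, dst_stride, h, &sse, NULL, NULL);
  int se2 = col16_kernel(src + 16, src_stride, x_offset, y_offset,
                         dst + 16, dst_stride, h, &sse2, NULL, NULL);
  se += se2;
  sse += sse2;
  *sse_ptr = sse;
  return sse - (uint32_t)(((int64_t)se * se) / (32 * h));
}

In the real wrappers the two extra arguments are always passed as NULL from the
C side; as the comment added by the patch says, they are only place holders for
the PIC enabled build.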