[XviD-devel] Reduced-Resolution P-frame
skal
xvid-devel@xvid.org
29 Nov 2002 14:22:22 +0100
Hi
On Fri, 2002-11-29 at 12:38, skal wrote:
> Anyway, gimme 1 day (maybe less) to put the filtering code
> into 'xvid-like' shape and I'll post it here...
>
Here it is. I think the API is pretty self-explanatory
(erhm) -> just ask if anything looks broken/strange.
bye,
Skal
-------------- attachment: reduced_mmx.asm --------------
;/****************************************************************************
; *
; * XVID MPEG-4 VIDEO CODEC
; * Reduced-Resolution utilities
; *
; * Copyright(C) 2002 Pascal Massimino <skal@planet-d.net>
; *
; * This file is part of XviD, a free MPEG-4 video encoder/decoder
; *
; * XviD is free software; you can redistribute it and/or modify it
; * under the terms of the GNU General Public License as published by
; * the Free Software Foundation; either version 2 of the License, or
; * (at your option) any later version.
; *
; * This program is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; * GNU General Public License for more details.
; *
; * You should have received a copy of the GNU General Public License
; * along with this program; if not, write to the Free Software
; * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
; *
; * Under section 8 of the GNU General Public License, the copyright
; * holders of XVID explicitly forbid distribution in the following
; * countries:
; *
; * - Japan
; * - United States of America
; *
; * Linking XviD statically or dynamically with other modules is making a
; * combined work based on XviD. Thus, the terms and conditions of the
; * GNU General Public License cover the whole combination.
; *
; * As a special exception, the copyright holders of XviD give you
; * permission to link XviD with independent modules that communicate with
; * XviD solely through the VFW1.1 and DShow interfaces, regardless of the
; * license terms of these independent modules, and to copy and distribute
; * the resulting combined work under terms of your choice, provided that
; * every copy of the combined work is accompanied by a complete copy of
; * the source code of XviD (the version of XviD used to produce the
; * combined work), being distributed under the terms of the GNU General
; * Public License plus this exception. An independent module is a module
; * which is not derived from or based on XviD.
; *
; * Note that people who make modified versions of XviD are not obligated
; * to grant this special exception for their modified versions; it is
; * their choice whether to do so. The GNU General Public License gives
; * permission to release a modified version without this exception; this
; * exception also makes it possible to release a modified version which
; * carries forward this exception.
; *
; * $Id: reduced_mmx.asm,v 1.5 2002/11/17 00:32:06 edgomez Exp $
; *
; ****************************************************************************/
bits 32
%macro cglobal 1
%ifdef PREFIX
global _%1
%define %1 _%1
%else
global %1
%endif
%endmacro
;=================================================================
section .data
align 16
Up31 dw 3, 1, 3, 1
Up13 dw 1, 3, 1, 3
Up93 dw 9, 3, 9, 3
Up39 dw 3, 9, 3, 9
Cst0 dw 0, 0, 0, 0
Cst2 dw 2, 2, 2, 2
Cst3 dw 3, 3, 3, 3
Cst32 dw 32,32,32,32
Cst2000 dw 2, 0, 0, 0
Cst0002 dw 0, 0, 0, 2
Mask_ff dw 0xff,0xff,0xff,0xff
;=================================================================
section .text
cglobal xvid_Copy_Upsampled_8x8_16To8_mmx
cglobal xvid_Add_Upsampled_8x8_16To8_mmx
cglobal xvid_Copy_Upsampled_8x8_16To8_xmm
cglobal xvid_Add_Upsampled_8x8_16To8_xmm
cglobal xvid_HFilter_31_mmx
cglobal xvid_VFilter_31_x86
cglobal xvid_HFilter_31_x86
cglobal xvid_Filter_18x18_To_8x8_mmx
cglobal xvid_Filter_Diff_18x18_To_8x8_mmx
;//////////////////////////////////////////////////////////////////////
;// 8x8 -> 16x16 upsampling (16b)
;//////////////////////////////////////////////////////////////////////
%macro MUL_PACK 4 ; %1/%2: regs (mm4/mm5: implicit inputs), %3/%4: Up13/Up31
pmullw %1, %3 ; [Up13]
pmullw mm4, %4 ; [Up31]
pmullw %2, %3 ; [Up13]
pmullw mm5, %4 ; [Up31]
paddsw %1, [Cst2]
paddsw %2, [Cst2]
paddsw %1, mm4
paddsw %2, mm5
%endmacro
; MMX-way of reordering columns...
%macro COL03 3 ;%1/%2: regs, %3: row -output: mm4/mm5
movq %1, [edx+%3*16+0*2]  ; %1  = 0|1|2|3
movq %2, [edx+%3*16+1*2]  ; %2  = 1|2|3|4
movq mm5, %1              ; mm5 = 0|1|2|3
movq mm4, %1              ; mm4 = 0|1|2|3
punpckhwd mm5, %2         ; mm5 = 2|3|3|4
punpcklwd mm4, %2         ; mm4 = 0|1|1|2
punpcklwd %1, %1          ; %1  = 0|0|1|1
punpcklwd %2, mm5         ; %2  = 1|2|2|3
punpcklwd %1, mm4         ; %1  = 0|0|0|1
%endmacro
%macro COL47 3 ;%1-%2: regs, %3: row -output: mm4/mm5
movq mm5, [edx+%3*16+4*2] ; mm5 = 4|5|6|7
movq %1, [edx+%3*16+3*2]  ; %1  = 3|4|5|6
movq %2, mm5              ; %2  = 4|5|6|7
movq mm4, mm5             ; mm4 = 4|5|6|7
punpckhwd %2, %2          ; %2  = 6|6|7|7
punpckhwd mm5, %2         ; mm5 = 6|7|7|7
movq %2, %1               ; %2  = 3|4|5|6
punpcklwd %1, mm4         ; %1  = 3|4|4|5
punpckhwd %2, mm4         ; %2  = 5|6|6|7
punpcklwd mm4, %2         ; mm4 = 4|5|5|6
%endmacro
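; Note: at the block borders the column vectors replicate the edge
; sample (0|0|0|1 on the left, 6|7|7|7 on the right), so the 3:1 tap
; degenerates to (4*s+2)>>2 = s there: the corner pixels are plain
; copies, matching Dst[0] = CLIP(Src[0]) in the C reference.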
%macro MIX_ROWS 4 ; %1/%2: prev, %3/%4: cur (preserved), mm4/mm5: output
; we need to perform: (%1,%3) -> (%1 = 3*%1+%3, mm4 = 3*%3+%1), %3 preserved.
movq mm4, [Cst3]
movq mm5, [Cst3]
pmullw mm4, %3
pmullw mm5, %4
paddsw mm4, %1
paddsw mm5, %2
pmullw %1, [Cst3]
pmullw %2, [Cst3]
paddsw %1, %3
paddsw %2, %4
%endmacro
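; Note on rounding: each MUL_PACK row already carries a '+2' bias, so
; after the 3:1 blend of MIX_ROWS the interior rows hold weight-16 sums
; with bias 3*2+2 = 8, which is exactly the '+8' rounder of the C-code
; Filter_9331 before STORE_2's '>>4'.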
;=================================================================
;
; void xvid_Copy_Upsampled_8x8_16To8_mmx(uint8_t *Dst,
;                                        const int16_t *Src, const int BpS);
;
;=================================================================
; Note: we can use ">>2" instead of "/4" here, since we
; are (supposed to be) averaging positive values.
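; (e.g. 25>>2 = 6 = 25/4, whereas -1>>2 = -1 but -1/4 = 0; see the
; signed-division note before STORE_ADD_1 below)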
%macro STORE_1 2
psraw %1, 2
psraw %2, 2
packuswb %1, %2
movq [ecx], %1
%endmacro
%macro STORE_2 2 ; pack and store (%1,%2) + (mm4,mm5)
psraw %1, 4
psraw %2, 4
psraw mm4, 4
psraw mm5, 4
packuswb %1,%2
packuswb mm4, mm5
movq [ecx], %1
movq [ecx+eax], mm4
lea ecx, [ecx+2*eax]
%endmacro
;//////////////////////////////////////////////////////////////////////
align 16
xvid_Copy_Upsampled_8x8_16To8_mmx: ; 344c
mov ecx, [esp+4] ; Dst
mov edx, [esp+8] ; Src
mov eax, [esp+12] ; BpS
movq mm6, [Up13]
movq mm7, [Up31]
COL03 mm0, mm1, 0
MUL_PACK mm0,mm1, mm6, mm7
movq mm4, mm0
movq mm5, mm1
STORE_1 mm4, mm5
add ecx, eax
COL03 mm2, mm3, 1
MUL_PACK mm2,mm3, mm6, mm7
MIX_ROWS mm0, mm1, mm2, mm3
STORE_2 mm0, mm1
COL03 mm0, mm1, 2
MUL_PACK mm0,mm1, mm6, mm7
MIX_ROWS mm2, mm3, mm0, mm1
STORE_2 mm2, mm3
COL03 mm2, mm3, 3
MUL_PACK mm2,mm3, mm6, mm7
MIX_ROWS mm0, mm1, mm2, mm3
STORE_2 mm0, mm1
COL03 mm0, mm1, 4
MUL_PACK mm0,mm1, mm6, mm7
MIX_ROWS mm2, mm3, mm0, mm1
STORE_2 mm2, mm3
COL03 mm2, mm3, 5
MUL_PACK mm2,mm3, mm6, mm7
MIX_ROWS mm0, mm1, mm2, mm3
STORE_2 mm0, mm1
COL03 mm0, mm1, 6
MUL_PACK mm0,mm1, mm6, mm7
MIX_ROWS mm2, mm3, mm0, mm1
STORE_2 mm2, mm3
COL03 mm2, mm3, 7
MUL_PACK mm2,mm3, mm6, mm7
MIX_ROWS mm0, mm1, mm2, mm3
STORE_2 mm0, mm1
STORE_1 mm2, mm3
mov ecx, [esp+4]
add ecx, 8
COL47 mm0, mm1, 0
MUL_PACK mm0,mm1, mm6, mm7
movq mm4, mm0
movq mm5, mm1
STORE_1 mm4, mm5
add ecx, eax
COL47 mm2, mm3, 1
MUL_PACK mm2,mm3, mm6, mm7
MIX_ROWS mm0, mm1, mm2, mm3
STORE_2 mm0, mm1
COL47 mm0, mm1, 2
MUL_PACK mm0,mm1, mm6, mm7
MIX_ROWS mm2, mm3, mm0, mm1
STORE_2 mm2, mm3
COL47 mm2, mm3, 3
MUL_PACK mm2,mm3, mm6, mm7
MIX_ROWS mm0, mm1, mm2, mm3
STORE_2 mm0, mm1
COL47 mm0, mm1, 4
MUL_PACK mm0,mm1, mm6, mm7
MIX_ROWS mm2, mm3, mm0, mm1
STORE_2 mm2, mm3
COL47 mm2, mm3, 5
MUL_PACK mm2,mm3, mm6, mm7
MIX_ROWS mm0, mm1, mm2, mm3
STORE_2 mm0, mm1
COL47 mm0, mm1, 6
MUL_PACK mm0,mm1, mm6, mm7
MIX_ROWS mm2, mm3, mm0, mm1
STORE_2 mm2, mm3
COL47 mm2, mm3, 7
MUL_PACK mm2,mm3, mm6, mm7
MIX_ROWS mm0, mm1, mm2, mm3
STORE_2 mm0, mm1
STORE_1 mm2, mm3
ret
;=================================================================
;
; void xvid_Add_Upsampled_8x8_16To8_mmx(uint8_t *Dst,
;                                       const int16_t *Src, const int BpS);
;
;=================================================================
; Note: grrr... the 'pcmpgtw' stuff implements the "/4" and "/16"
; operators with ">>2" and ">>4", using:
;    x/4  = ( (x-(x<0))>>2 ) + (x<0)
;    x/16 = ( (x-(x<0))>>4 ) + (x<0)
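; Worked example with x = -5: the pcmpgtw mask is 0xffff (= -1), so
;   paddsw: -5 + (-1) = -6 ; psraw 2: -6>>2 = -2 ; psubsw: -2 - (-1) = -1
; i.e. the C result -5/4 = -1 (truncation toward zero), whereas a plain
; arithmetic shift alone would have given -5>>2 = -2.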
%macro STORE_ADD_1 2
; We subtract the rounder '2' for corner pixels,
; since when 'x' is negative, (x*4 + 2)/4 is *not*
; equal to 'x'. In fact, the correct relation is:
;    (x*4 + 2)/4 = x - (x<0)
; So, better revert to (x*4)/4 = x.
psubsw %1, [Cst2000]
psubsw %2, [Cst0002]
pxor mm6, mm6
pxor mm7, mm7
pcmpgtw mm6, %1
pcmpgtw mm7, %2
paddsw %1, mm6
paddsw %2, mm7
psraw %1, 2
psraw %2, 2
psubsw %1, mm6
psubsw %2, mm7
; mix with destination [ecx]
movq mm6, [ecx]
movq mm7, [ecx]
punpcklbw mm6, [Cst0]
punpckhbw mm7, [Cst0]
paddsw %1, mm6
paddsw %2, mm7
packuswb %1,%2
movq [ecx], %1
%endmacro
%macro STORE_ADD_2 2
pxor mm6, mm6
pxor mm7, mm7
pcmpgtw mm6, %1
pcmpgtw mm7, %2
paddsw %1, mm6
paddsw %2, mm7
psraw %1, 4
psraw %2, 4
psubsw %1, mm6
psubsw %2, mm7
pxor mm6, mm6
pxor mm7, mm7
pcmpgtw mm6, mm4
pcmpgtw mm7, mm5
paddsw mm4, mm6
paddsw mm5, mm7
psraw mm4, 4
psraw mm5, 4
psubsw mm4, mm6
psubsw mm5, mm7
; mix with destination
movq mm6, [ecx]
movq mm7, [ecx]
punpcklbw mm6, [Cst0]
punpckhbw mm7, [Cst0]
paddsw %1, mm6
paddsw %2, mm7
movq mm6, [ecx+eax]
movq mm7, [ecx+eax]
punpcklbw mm6, [Cst0]
punpckhbw mm7, [Cst0]
paddsw mm4, mm6
paddsw mm5, mm7
packuswb %1,%2
packuswb mm4, mm5
movq [ecx], %1
movq [ecx+eax], mm4
lea ecx, [ecx+2*eax]
%endmacro
;//////////////////////////////////////////////////////////////////////
align 16
xvid_Add_Upsampled_8x8_16To8_mmx: ; 579c
mov ecx, [esp+4] ; Dst
mov edx, [esp+8] ; Src
mov eax, [esp+12] ; BpS
COL03 mm0, mm1, 0
MUL_PACK mm0,mm1, [Up13], [Up31]
movq mm4, mm0
movq mm5, mm1
STORE_ADD_1 mm4, mm5
add ecx, eax
COL03 mm2, mm3, 1
MUL_PACK mm2,mm3, [Up13], [Up31]
MIX_ROWS mm0, mm1, mm2, mm3
STORE_ADD_2 mm0, mm1
COL03 mm0, mm1, 2
MUL_PACK mm0,mm1, [Up13], [Up31]
MIX_ROWS mm2, mm3, mm0, mm1
STORE_ADD_2 mm2, mm3
COL03 mm2, mm3, 3
MUL_PACK mm2,mm3, [Up13], [Up31]
MIX_ROWS mm0, mm1, mm2, mm3
STORE_ADD_2 mm0, mm1
COL03 mm0, mm1, 4
MUL_PACK mm0,mm1, [Up13], [Up31]
MIX_ROWS mm2, mm3, mm0, mm1
STORE_ADD_2 mm2, mm3
COL03 mm2, mm3, 5
MUL_PACK mm2,mm3, [Up13], [Up31]
MIX_ROWS mm0, mm1, mm2, mm3
STORE_ADD_2 mm0, mm1
COL03 mm0, mm1, 6
MUL_PACK mm0,mm1, [Up13], [Up31]
MIX_ROWS mm2, mm3, mm0, mm1
STORE_ADD_2 mm2, mm3
COL03 mm2, mm3, 7
MUL_PACK mm2,mm3, [Up13], [Up31]
MIX_ROWS mm0, mm1, mm2, mm3
STORE_ADD_2 mm0, mm1
STORE_ADD_1 mm2, mm3
mov ecx, [esp+4]
add ecx, 8
COL47 mm0, mm1, 0
MUL_PACK mm0,mm1, [Up13], [Up31]
movq mm4, mm0
movq mm5, mm1
STORE_ADD_1 mm4, mm5
add ecx, eax
COL47 mm2, mm3, 1
MUL_PACK mm2,mm3, [Up13], [Up31]
MIX_ROWS mm0, mm1, mm2, mm3
STORE_ADD_2 mm0, mm1
COL47 mm0, mm1, 2
MUL_PACK mm0,mm1, [Up13], [Up31]
MIX_ROWS mm2, mm3, mm0, mm1
STORE_ADD_2 mm2, mm3
COL47 mm2, mm3, 3
MUL_PACK mm2,mm3, [Up13], [Up31]
MIX_ROWS mm0, mm1, mm2, mm3
STORE_ADD_2 mm0, mm1
COL47 mm0, mm1, 4
MUL_PACK mm0,mm1, [Up13], [Up31]
MIX_ROWS mm2, mm3, mm0, mm1
STORE_ADD_2 mm2, mm3
COL47 mm2, mm3, 5
MUL_PACK mm2,mm3, [Up13], [Up31]
MIX_ROWS mm0, mm1, mm2, mm3
STORE_ADD_2 mm0, mm1
COL47 mm0, mm1, 6
MUL_PACK mm0,mm1, [Up13], [Up31]
MIX_ROWS mm2, mm3, mm0, mm1
STORE_ADD_2 mm2, mm3
COL47 mm2, mm3, 7
MUL_PACK mm2,mm3, [Up13], [Up31]
MIX_ROWS mm0, mm1, mm2, mm3
STORE_ADD_2 mm0, mm1
STORE_ADD_1 mm2, mm3
ret
;=================================================================
;
; void xvid_Copy_Upsampled_8x8_16To8_xmm(uint8_t *Dst,
;                                        const int16_t *Src, const int BpS);
;
;=================================================================
; xmm version can take (little) advantage of 'pshufw'
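; (reminder: pshufw dst, src, imm sets dst word n to src word
; ((imm>>(2*n)) & 3), so e.g. (0+0*4+0*16+1*64) selects words 0,0,0,1)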
%macro COL03_SSE 3 ;%1/%2: regs, %3: row -trashes mm4/mm5
movq %2, [edx+%3*16+0*2]                        ; <- 0|1|2|3
pshufw %1, %2, (0+0*4+0*16+1*64)                ; %1  = 0|0|0|1
pshufw mm4, %2, (0+1*4+1*16+2*64)               ; mm4 = 0|1|1|2
pshufw %2, %2, (1+2*4+2*16+3*64)                ; %2  = 1|2|2|3
pshufw mm5, [edx+%3*16+2*2], (0+1*4+1*16+2*64)  ; mm5 = 2|3|3|4
%endmacro
%macro COL47_SSE 3 ;%1-%2: regs, %3: row -trashes mm4/mm5
pshufw %1, [edx+%3*16+2*2], (1+2*4+2*16+3*64) ; 3|4|4|5
movq mm5, [edx+%3*16+2*4] ; <- 4|5|6|7
pshufw mm4, mm5, (0+1*4+1*16+2*64) ; 4|5|5|6
pshufw %2, mm5, (1+2*4+2*16+3*64) ; 5|6|6|7
pshufw mm5, mm5, (2+3*4+3*16+3*64) ; 6|7|7|7
%endmacro
;//////////////////////////////////////////////////////////////////////
align 16
xvid_Copy_Upsampled_8x8_16To8_xmm: ; 315c
mov ecx, [esp+4] ; Dst
mov edx, [esp+8] ; Src
mov eax, [esp+12] ; BpS
movq mm6, [Up13]
movq mm7, [Up31]
COL03_SSE mm0, mm1, 0
MUL_PACK mm0,mm1, mm6, mm7
movq mm4, mm0
movq mm5, mm1
STORE_1 mm4, mm5
add ecx, eax
COL03_SSE mm2, mm3, 1
MUL_PACK mm2,mm3, mm6, mm7
MIX_ROWS mm0, mm1, mm2, mm3
STORE_2 mm0, mm1
COL03_SSE mm0, mm1, 2
MUL_PACK mm0,mm1, mm6, mm7
MIX_ROWS mm2, mm3, mm0, mm1
STORE_2 mm2, mm3
COL03_SSE mm2, mm3, 3
MUL_PACK mm2,mm3, mm6, mm7
MIX_ROWS mm0, mm1, mm2, mm3
STORE_2 mm0, mm1
COL03_SSE mm0, mm1, 4
MUL_PACK mm0,mm1, mm6, mm7
MIX_ROWS mm2, mm3, mm0, mm1
STORE_2 mm2, mm3
COL03_SSE mm2, mm3, 5
MUL_PACK mm2,mm3, mm6, mm7
MIX_ROWS mm0, mm1, mm2, mm3
STORE_2 mm0, mm1
COL03_SSE mm0, mm1, 6
MUL_PACK mm0,mm1, mm6, mm7
MIX_ROWS mm2, mm3, mm0, mm1
STORE_2 mm2, mm3
COL03_SSE mm2, mm3, 7
MUL_PACK mm2,mm3, mm6, mm7
MIX_ROWS mm0, mm1, mm2, mm3
STORE_2 mm0, mm1
STORE_1 mm2, mm3
mov ecx, [esp+4]
add ecx, 8
COL47_SSE mm0, mm1, 0
MUL_PACK mm0,mm1, mm6, mm7
movq mm4, mm0
movq mm5, mm1
STORE_1 mm4, mm5
add ecx, eax
COL47_SSE mm2, mm3, 1
MUL_PACK mm2,mm3, mm6, mm7
MIX_ROWS mm0, mm1, mm2, mm3
STORE_2 mm0, mm1
COL47_SSE mm0, mm1, 2
MUL_PACK mm0,mm1, mm6, mm7
MIX_ROWS mm2, mm3, mm0, mm1
STORE_2 mm2, mm3
COL47_SSE mm2, mm3, 3
MUL_PACK mm2,mm3, mm6, mm7
MIX_ROWS mm0, mm1, mm2, mm3
STORE_2 mm0, mm1
COL47_SSE mm0, mm1, 4
MUL_PACK mm0,mm1, mm6, mm7
MIX_ROWS mm2, mm3, mm0, mm1
STORE_2 mm2, mm3
COL47_SSE mm2, mm3, 5
MUL_PACK mm2,mm3, mm6, mm7
MIX_ROWS mm0, mm1, mm2, mm3
STORE_2 mm0, mm1
COL47_SSE mm0, mm1, 6
MUL_PACK mm0,mm1, mm6, mm7
MIX_ROWS mm2, mm3, mm0, mm1
STORE_2 mm2, mm3
COL47_SSE mm2, mm3, 7
MUL_PACK mm2,mm3, mm6, mm7
MIX_ROWS mm0, mm1, mm2, mm3
STORE_2 mm0, mm1
STORE_1 mm2, mm3
ret
;=================================================================
;
; void xvid_Add_Upsampled_8x8_16To8_xmm(uint8_t *Dst,
;                                       const int16_t *Src, const int BpS);
;
;=================================================================
align 16
xvid_Add_Upsampled_8x8_16To8_xmm: ; 549c
mov ecx, [esp+4] ; Dst
mov edx, [esp+8] ; Src
mov eax, [esp+12] ; BpS
COL03_SSE mm0, mm1, 0
MUL_PACK mm0,mm1, [Up13], [Up31]
movq mm4, mm0
movq mm5, mm1
STORE_ADD_1 mm4, mm5
add ecx, eax
COL03_SSE mm2, mm3, 1
MUL_PACK mm2,mm3, [Up13], [Up31]
MIX_ROWS mm0, mm1, mm2, mm3
STORE_ADD_2 mm0, mm1
COL03_SSE mm0, mm1, 2
MUL_PACK mm0,mm1, [Up13], [Up31]
MIX_ROWS mm2, mm3, mm0, mm1
STORE_ADD_2 mm2, mm3
COL03_SSE mm2, mm3, 3
MUL_PACK mm2,mm3, [Up13], [Up31]
MIX_ROWS mm0, mm1, mm2, mm3
STORE_ADD_2 mm0, mm1
COL03_SSE mm0, mm1, 4
MUL_PACK mm0,mm1, [Up13], [Up31]
MIX_ROWS mm2, mm3, mm0, mm1
STORE_ADD_2 mm2, mm3
COL03_SSE mm2, mm3, 5
MUL_PACK mm2,mm3, [Up13], [Up31]
MIX_ROWS mm0, mm1, mm2, mm3
STORE_ADD_2 mm0, mm1
COL03_SSE mm0, mm1, 6
MUL_PACK mm0,mm1, [Up13], [Up31]
MIX_ROWS mm2, mm3, mm0, mm1
STORE_ADD_2 mm2, mm3
COL03_SSE mm2, mm3, 7
MUL_PACK mm2,mm3, [Up13], [Up31]
MIX_ROWS mm0, mm1, mm2, mm3
STORE_ADD_2 mm0, mm1
STORE_ADD_1 mm2, mm3
mov ecx, [esp+4]
add ecx, 8
COL47_SSE mm0, mm1, 0
MUL_PACK mm0,mm1, [Up13], [Up31]
movq mm4, mm0
movq mm5, mm1
STORE_ADD_1 mm4, mm5
add ecx, eax
COL47_SSE mm2, mm3, 1
MUL_PACK mm2,mm3, [Up13], [Up31]
MIX_ROWS mm0, mm1, mm2, mm3
STORE_ADD_2 mm0, mm1
COL47_SSE mm0, mm1, 2
MUL_PACK mm0,mm1, [Up13], [Up31]
MIX_ROWS mm2, mm3, mm0, mm1
STORE_ADD_2 mm2, mm3
COL47_SSE mm2, mm3, 3
MUL_PACK mm2,mm3, [Up13], [Up31]
MIX_ROWS mm0, mm1, mm2, mm3
STORE_ADD_2 mm0, mm1
COL47_SSE mm0, mm1, 4
MUL_PACK mm0,mm1, [Up13], [Up31]
MIX_ROWS mm2, mm3, mm0, mm1
STORE_ADD_2 mm2, mm3
COL47_SSE mm2, mm3, 5
MUL_PACK mm2,mm3, [Up13], [Up31]
MIX_ROWS mm0, mm1, mm2, mm3
STORE_ADD_2 mm0, mm1
COL47_SSE mm0, mm1, 6
MUL_PACK mm0,mm1, [Up13], [Up31]
MIX_ROWS mm2, mm3, mm0, mm1
STORE_ADD_2 mm2, mm3
COL47_SSE mm2, mm3, 7
MUL_PACK mm2,mm3, [Up13], [Up31]
MIX_ROWS mm0, mm1, mm2, mm3
STORE_ADD_2 mm0, mm1
STORE_ADD_1 mm2, mm3
ret
;=================================================================
;
; void xvid_HFilter_31_mmx(uint8_t *Src1, uint8_t *Src2, int Nb_Blks);
; void xvid_VFilter_31_x86(uint8_t *Src1, uint8_t *Src2, const int BpS, int Nb_Blks);
; void xvid_HFilter_31_x86(uint8_t *Src1, uint8_t *Src2, int Nb_Blks);
;
;=================================================================
;//////////////////////////////////////////////////////////////////////
;// horizontal/vertical filtering: [x,y] -> [ (3x+y+2)>>2, (x+3y+2)>>2 ]
;//
;// We use the trick: tmp = (x+y+2) -> [x = (tmp+2x)>>2, y = (tmp+2y)>>2]
;//////////////////////////////////////////////////////////////////////
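;// Check: (tmp+2x)>>2 = (x+y+2+2x)>>2 = (3x+y+2)>>2. E.g. x=200, y=100:
;// tmp = 302, outputs (302+400)>>2 = 175 and (302+200)>>2 = 125, i.e.
;// each pixel moves 1/4 of the way towards the other.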
align 16
xvid_HFilter_31_mmx:
push esi
push edi
mov esi, [esp+4 +8] ; Src1
mov edi, [esp+8 +8] ; Src2
mov eax, [esp+12 +8] ; Nb_Blks
lea eax,[eax*2]
movq mm5, [Cst2]
pxor mm7, mm7
lea esi, [esi+eax*4]
lea edi, [edi+eax*4]
neg eax
.Loop: ;12c
movd mm0, [esi+eax*4]
movd mm1, [edi+eax*4]
movq mm2, mm5
punpcklbw mm0, mm7
punpcklbw mm1, mm7
paddsw mm2, mm0
paddsw mm0, mm0
paddsw mm2, mm1
paddsw mm1, mm1
paddsw mm0, mm2
paddsw mm1, mm2
psraw mm0, 2
psraw mm1, 2
packuswb mm0, mm7
packuswb mm1, mm7
movd [esi+eax*4], mm0
movd [edi+eax*4], mm1
add eax,1
jl .Loop
pop edi
pop esi
ret
; MMX is of no use here. Better use plain x86. Moreover, this is
; mostly for the fun of ASM coding, since every modern compiler can
; end up with code that looks very much like this one...
align 16
xvid_VFilter_31_x86:
push esi
push edi
push ebx
push ebp
mov esi, [esp+4 +16] ; Src1
mov edi, [esp+8 +16] ; Src2
mov ebp, [esp+12 +16] ; BpS
mov eax, [esp+16 +16] ; Nb_Blks
lea eax,[eax*8]
.Loop: ;7c
movzx ecx, byte [esi]
movzx edx, byte [edi]
lea ebx, [ecx+edx+2]
lea ecx,[ebx+2*ecx]
lea edx,[ebx+2*edx]
shr ecx,2
shr edx,2
mov [esi], cl
mov [edi], dl
lea esi, [esi+ebp]
lea edi, [edi+ebp]
dec eax
jg .Loop
pop ebp
pop ebx
pop edi
pop esi
ret
; this one's just a little faster than gcc's code. Very little.
align 16
xvid_HFilter_31_x86:
push esi
push edi
push ebx
mov esi, [esp+4 +12] ; Src1
mov edi, [esp+8 +12] ; Src2
mov eax, [esp+12 +12] ; Nb_Blks
lea eax,[eax*8]
lea esi, [esi+eax]
lea edi, [edi+eax]
neg eax
.Loop: ; 6c
movzx ecx, byte [esi+eax]
movzx edx, byte [edi+eax]
lea ebx, [ecx+edx+2]
lea ecx,[ebx+2*ecx]
lea edx,[ebx+2*edx]
shr ecx,2
shr edx,2
mov [esi+eax], cl
mov [edi+eax], dl
inc eax
jl .Loop
pop ebx
pop edi
pop esi
ret
;//////////////////////////////////////////////////////////////////////
;// 16b downsampling 16x16 -> 8x8
;//////////////////////////////////////////////////////////////////////
%macro HFILTER_1331 2 ;%1:src %2:dst reg. -trashes mm0/mm1/mm2
movq mm2, [Mask_ff]
movq %2, [%1-1] ;-10123456
movq mm0, [%1] ; 01234567
movq mm1, [%1+1] ; 12345678
pand %2, mm2 ;-1|1|3|5
pand mm0, mm2 ; 0|2|4|6
pand mm1, mm2 ; 1|3|5|7
pand mm2, [%1+2] ; 2|4|6|8
paddusw mm0, mm1
paddusw %2, mm2
pmullw mm0, mm7
paddusw %2, mm0
%endmacro
%macro VFILTER_1331 4 ; %1-4: regs %1-%2: trashed
paddsw %1, [Cst32]
paddsw %2, %3
pmullw %2, mm7
paddsw %1,%4
paddsw %1, %2
psraw %1, 6
%endmacro
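; Together these implement the 2-D (1,3,3,1)x(1,3,3,1) kernel of the C
; reference: HFILTER_1331 computes %2[i] = s[2i-1] + 3*s[2i] + 3*s[2i+1]
; + s[2i+2] for four output columns, and VFILTER_1331 combines four such
; rows as (%1 + 3*(%2+%3) + %4 + 32) >> 6.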
;=================================================================
;
; void xvid_Filter_18x18_To_8x8_mmx(int16_t *Dst,
;                                   const uint8_t *Src, const int BpS);
;
;=================================================================
%macro COPY_TWO_LINES_1331 1 ; %1: dst
HFILTER_1331 edx , mm5
HFILTER_1331 edx+eax, mm6
lea edx, [edx+2*eax]
VFILTER_1331 mm3,mm4,mm5, mm6
movq [%1], mm3
HFILTER_1331 edx , mm3
HFILTER_1331 edx+eax, mm4
lea edx, [edx+2*eax]
VFILTER_1331 mm5,mm6,mm3,mm4
movq [%1+16], mm5
%endmacro
align 16
xvid_Filter_18x18_To_8x8_mmx: ; 283c (~4.4c per output pixel)
mov ecx, [esp+4] ; Dst
mov edx, [esp+8] ; Src
mov eax, [esp+12] ; BpS
movq mm7, [Cst3]
sub edx, eax
; mm3/mm4/mm5/mm6 are used as a 4-sample delay line.
; process columns 0-3
HFILTER_1331 edx , mm3 ; pre-load mm3/mm4
HFILTER_1331 edx+eax, mm4
lea edx, [edx+2*eax]
COPY_TWO_LINES_1331 ecx + 0*16
COPY_TWO_LINES_1331 ecx + 2*16
COPY_TWO_LINES_1331 ecx + 4*16
COPY_TWO_LINES_1331 ecx + 6*16
; process columns 4-7
mov edx, [esp+8]
sub edx, eax
add edx, 8
HFILTER_1331 edx , mm3 ; pre-load mm3/mm4
HFILTER_1331 edx+eax, mm4
lea edx, [edx+2*eax]
COPY_TWO_LINES_1331 ecx + 0*16 +8
COPY_TWO_LINES_1331 ecx + 2*16 +8
COPY_TWO_LINES_1331 ecx + 4*16 +8
COPY_TWO_LINES_1331 ecx + 6*16 +8
ret
;=================================================================
;
; void xvid_Filter_Diff_18x18_To_8x8_mmx(int16_t *Dst,
;                                        const uint8_t *Src, const int BpS);
;
;=================================================================
%macro DIFF_TWO_LINES_1331 1 ; %1: dst
HFILTER_1331 edx , mm5
HFILTER_1331 edx+eax, mm6
lea edx, [edx+2*eax]
movq mm2, [%1]
VFILTER_1331 mm3,mm4,mm5, mm6
psubsw mm2, mm3
movq [%1], mm2
HFILTER_1331 edx , mm3
HFILTER_1331 edx+eax, mm4
lea edx, [edx+2*eax]
movq mm2, [%1+16]
VFILTER_1331 mm5,mm6,mm3,mm4
psubsw mm2, mm5
movq [%1+16], mm2
%endmacro
align 16
xvid_Filter_Diff_18x18_To_8x8_mmx: ; 302c
mov ecx, [esp+4] ; Dst
mov edx, [esp+8] ; Src
mov eax, [esp+12] ; BpS
movq mm7, [Cst3]
sub edx, eax
; mm3/mm4/mm5/mm6 are used as a 4-sample delay line.
; process columns 0-3
HFILTER_1331 edx , mm3 ; pre-load mm3/mm4
HFILTER_1331 edx+eax, mm4
lea edx, [edx+2*eax]
DIFF_TWO_LINES_1331 ecx + 0*16
DIFF_TWO_LINES_1331 ecx + 2*16
DIFF_TWO_LINES_1331 ecx + 4*16
DIFF_TWO_LINES_1331 ecx + 6*16
; process columns 4-7
mov edx, [esp+8]
sub edx, eax
add edx, 8
HFILTER_1331 edx , mm3 ; pre-load mm3/mm4
HFILTER_1331 edx+eax, mm4
lea edx, [edx+2*eax]
DIFF_TWO_LINES_1331 ecx + 0*16 +8
DIFF_TWO_LINES_1331 ecx + 2*16 +8
DIFF_TWO_LINES_1331 ecx + 4*16 +8
DIFF_TWO_LINES_1331 ecx + 6*16 +8
ret
;//////////////////////////////////////////////////////////////////////
; pfeewwww... Never Do That On Stage Again. :)
-------------- attachment: reduced.c --------------
/*****************************************************************************
*
* XVID MPEG-4 VIDEO CODEC
* Reduced-Resolution utilities
*
* Copyright(C) 2002 Pascal Massimino <skal@planet-d.net>
*
* This file is part of XviD, a free MPEG-4 video encoder/decoder
*
* XviD is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Under section 8 of the GNU General Public License, the copyright
* holders of XVID explicitly forbid distribution in the following
* countries:
*
* - Japan
* - United States of America
*
* Linking XviD statically or dynamically with other modules is making a
* combined work based on XviD. Thus, the terms and conditions of the
* GNU General Public License cover the whole combination.
*
* As a special exception, the copyright holders of XviD give you
* permission to link XviD with independent modules that communicate with
* XviD solely through the VFW1.1 and DShow interfaces, regardless of the
* license terms of these independent modules, and to copy and distribute
* the resulting combined work under terms of your choice, provided that
* every copy of the combined work is accompanied by a complete copy of
* the source code of XviD (the version of XviD used to produce the
* combined work), being distributed under the terms of the GNU General
* Public License plus this exception. An independent module is a module
* which is not derived from or based on XviD.
*
* Note that people who make modified versions of XviD are not obligated
* to grant this special exception for their modified versions; it is
* their choice whether to do so. The GNU General Public License gives
* permission to release a modified version without this exception; this
* exception also makes it possible to release a modified version which
* carries forward this exception.
*
* $Id: reduced.c,v 1.11 2002/11/17 00:32:06 edgomez Exp $
*
*****************************************************************************/
#include "portab.h"
/* prototypes */
// decoding
extern void xvid_Copy_Upsampled_8x8_16To8_C(uint8_t *Dst, const int16_t *Src, const int BpS);
extern void xvid_Add_Upsampled_8x8_16To8_C(uint8_t *Dst, const int16_t *Src, const int BpS);

/* Note: Nb_Blks is the number of 8-pixel blocks to process */
extern void xvid_HFilter_31_C(uint8_t *Src1, uint8_t *Src2, int Nb_Blks);
extern void xvid_VFilter_31_C(uint8_t *Src1, uint8_t *Src2, const int BpS, int Nb_Blks);

/* MMX version */
extern void xvid_Copy_Upsampled_8x8_16To8_mmx(uint8_t *Dst, const int16_t *Src, const int BpS);
extern void xvid_Add_Upsampled_8x8_16To8_mmx(uint8_t *Dst, const int16_t *Src, const int BpS);
extern void xvid_HFilter_31_mmx(uint8_t *Src1, uint8_t *Src2, int Nb_Blks);
extern void xvid_VFilter_31_x86(uint8_t *Src1, uint8_t *Src2, const int BpS, int Nb_Blks);
extern void xvid_HFilter_31_x86(uint8_t *Src1, uint8_t *Src2, int Nb_Blks);

/* SSE version */
extern void xvid_Copy_Upsampled_8x8_16To8_xmm(uint8_t *Dst, const int16_t *Src, const int BpS);
extern void xvid_Add_Upsampled_8x8_16To8_xmm(uint8_t *Dst, const int16_t *Src, const int BpS);

// encoding
/* WARNING! These read 1 pixel outside of the input 16x16 block! */
extern void xvid_Filter_18x18_To_8x8_C(int16_t *Dst, const uint8_t *Src, const int BpS);
extern void xvid_Filter_Diff_18x18_To_8x8_C(int16_t *Dst, const uint8_t *Src, const int BpS);

/* MMX version */
extern void xvid_Filter_18x18_To_8x8_mmx(int16_t *Dst, const uint8_t *Src, const int BpS);
extern void xvid_Filter_Diff_18x18_To_8x8_mmx(int16_t *Dst, const uint8_t *Src, const int BpS);
//////////////////////////////////////////////////////////
// Upsampling (1/3/3/1) filter
#define CLIP(x)  ((x)<0 ? 0 : (x)>255 ? 255 : (x))
#define ADD(dst,src)  (dst) = CLIP((dst)+(src))
static inline void Filter_31(uint8_t *Dst1, uint8_t *Dst2,
const int16_t *Src1, const int16_t *Src2)
{
/* Src[] is assumed to be >=0. So we can use ">>2" instead of "/4" */
int16_t a = (3*Src1[0]+  Src2[0]+2) >> 2;
int16_t b = (  Src1[0]+3*Src2[0]+2) >> 2;
Dst1[0] = CLIP(a);
Dst2[0] = CLIP(b);
}
static inline void Filter_9331(uint8_t *Dst1, uint8_t *Dst2,
const int16_t *Src1, const int16_t *Src2)
{
/* Src[] is assumed to be >=0. So we can use ">>4" instead of "/16" */
int16_t a = (9*Src1[0]+ 3*Src1[1]+ 3*Src2[0] + 1*Src2[1] + 8) >> 4;
int16_t b = (3*Src1[0]+ 9*Src1[1]+ 1*Src2[0] + 3*Src2[1] + 8) >> 4;
int16_t c = (3*Src1[0]+ 1*Src1[1]+ 9*Src2[0] + 3*Src2[1] + 8) >> 4;
int16_t d = (1*Src1[0]+ 3*Src1[1]+ 3*Src2[0] + 9*Src2[1] + 8) >> 4;
Dst1[0] = CLIP(a);
Dst1[1] = CLIP(b);
Dst2[0] = CLIP(c);
Dst2[1] = CLIP(d);
}
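/* The 9/3/3/1 weights above are the separable product of the 1-D (3,1)
   taps of Filter_31; they sum to 16, hence the '+8' rounder and '>>4'. */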
void xvid_Copy_Upsampled_8x8_16To8_C(uint8_t *Dst, const int16_t *Src, const int BpS)
{
int x, y;
Dst[0] = CLIP(Src[0]);
for(x=0; x<7; ++x) Filter_31(Dst+2*x+1, Dst+2*x+2, Src+x, Src+x+1);
Dst[15] = CLIP(Src[7]);
Dst += BpS;
for(y=0; y<7; ++y) {
uint8_t *const Dst2 = Dst + BpS;
Filter_31(Dst, Dst2, Src, Src+8);
for(x=0; x<7; ++x)
Filter_9331(Dst+2*x+1, Dst2+2*x+1, Src+x, Src+x+8);
Filter_31(Dst+15, Dst2+15, Src+7, Src+7+8);
Src += 8;
Dst += 2*BpS;
}
Dst[0] = CLIP(Src[0]);
for(x=0; x<7; ++x) Filter_31(Dst+2*x+1, Dst+2*x+2, Src+x, Src+x+1);
Dst[15] = CLIP(Src[7]);
}
static inline void Filter_Add_31(uint8_t *Dst1, uint8_t *Dst2,
const int16_t *Src1, const int16_t *Src2)
{
/* Here, we must use "/4", since Src[] is in [-256, 255] */
int16_t a = (3*Src1[0]+  Src2[0] + 2) / 4;
int16_t b = (  Src1[0]+3*Src2[0] + 2) / 4;
ADD(Dst1[0], a);
ADD(Dst2[0], b);
}
static inline void Filter_Add_9331(uint8_t *Dst1, uint8_t *Dst2,
                                   const int16_t *Src1, const int16_t *Src2)
{
int16_t a = (9*Src1[0]+ 3*Src1[1]+ 3*Src2[0] + 1*Src2[1] + 8) / 16;
int16_t b = (3*Src1[0]+ 9*Src1[1]+ 1*Src2[0] + 3*Src2[1] + 8) / 16;
int16_t c = (3*Src1[0]+ 1*Src1[1]+ 9*Src2[0] + 3*Src2[1] + 8) / 16;
int16_t d = (1*Src1[0]+ 3*Src1[1]+ 3*Src2[0] + 9*Src2[1] + 8) / 16;
ADD(Dst1[0], a);
ADD(Dst1[1], b);
ADD(Dst2[0], c);
ADD(Dst2[1], d);
}
void xvid_Add_Upsampled_8x8_16To8_C(uint8_t *Dst, const int16_t *Src, const int BpS)
{
int x, y;
ADD(Dst[0], Src[0]);
for(x=0; x<7; ++x) Filter_Add_31(Dst+2*x+1, Dst+2*x+2, Src+x, Src+x+1);
ADD(Dst[15], Src[7]);
Dst += BpS;
for(y=0; y<7; ++y) {
uint8_t *const Dst2 = Dst + BpS;
Filter_Add_31(Dst, Dst2, Src, Src+8);
for(x=0; x<7; ++x)
Filter_Add_9331(Dst+2*x+1, Dst2+2*x+1, Src+x, Src+x+8);
Filter_Add_31(Dst+15, Dst2+15, Src+7, Src+7+8);
Src += 8;
Dst += 2*BpS;
}
ADD(Dst[0], Src[0]);
for(x=0; x<7; ++x) Filter_Add_31(Dst+2*x+1, Dst+2*x+2, Src+x, Src+x+1);
ADD(Dst[15], Src[7]);
}
#undef CLIP
#undef ADD
//////////////////////////////////////////////////////////
// horizontal and vertical deblocking
void xvid_HFilter_31_C(uint8_t *Src1, uint8_t *Src2, int Nb_Blks)
{
Nb_Blks *= 8;
while(Nb_Blks-->0) {
uint8_t a = ( 3*Src1[0] + 1*Src2[0] + 2 ) >> 2;
uint8_t b = ( 1*Src1[0] + 3*Src2[0] + 2 ) >> 2;
*Src1++ = a;
*Src2++ = b;
}
}
void xvid_VFilter_31_C(uint8_t *Src1, uint8_t *Src2, const int BpS, int Nb_Blks)
{
Nb_Blks *= 8;
while(Nb_Blks-->0) {
uint8_t a = ( 3*Src1[0] + 1*Src2[0] + 2 ) >> 2;
uint8_t b = ( 1*Src1[0] + 3*Src2[0] + 2 ) >> 2;
*Src1 = a;
*Src2 = b;
Src1 += BpS;
Src2 += BpS;
}
}
//////////////////////////////////////////////////////////
// 16x16 -> 8x8 (1/3/3/1) downsampling
//
// Warning! These read 1 pixel outside of the input 16x16 block!
//
//////////////////////////////////////////////////////////
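// (i.e. the caller must guarantee that Src[-BpS-1] through Src[16*BpS+16]
// are readable: in practice, point Src at a 16x16 block inside a padded
// frame, or at offset (1,1) of a private 18x18 buffer)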
void xvid_Filter_18x18_To_8x8_C(int16_t *Dst, const uint8_t *Src, const int BpS)
{
int16_t *T, Tmp[18*8];
int i, j;
T = Tmp;
Src -= BpS;
for(j=-1; j<17; j++) {
for(i=0; i<8; ++i)
T[i] = Src[2*i-1] + 3*Src[2*i+0] + 3*Src[2*i+1] + Src[2*i+2];
T += 8;
Src += BpS;
}
T = Tmp + 8;
for(j=0; j<8; j++) {
for(i=0; i<8; ++i)
Dst[i] = ( T[-8+i] + 3*T[0+i] + 3*T[8+i] + T[16+i] + 32 ) / 64;
Dst += 8;
T += 16;
}
}
void xvid_Filter_Diff_18x18_To_8x8_C(int16_t *Dst, const uint8_t *Src, const int BpS)
{
int16_t *T, Tmp[18*8];
int i, j;
T = Tmp;
Src -= BpS;
for(j=-1; j<17; j++) {
for(i=0; i<8; ++i)
T[i] = Src[2*i-1] + 3*Src[2*i+0] + 3*Src[2*i+1] + Src[2*i+2];
T += 8;
Src += BpS;
}
T = Tmp;
for(j=0; j<8; j++) {
for(i=0; i<8; ++i)
Dst[i] -= ( T[i] + 3*T[8+i] + 3*T[16+i] + T[24+i] + 32 ) / 64;
Dst += 8;
T += 16;
}
}
//////////////////////////////////////////////////////////
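/* Usage sketch (illustrative only, not part of the patch): round-trip one
   block through the encode-side downsampler and the decode-side upsampler.
   The 18x18 buffer 'Pix', the test pattern and 'main' are made up here to
   show the calling convention, nothing more. */
#include <stdio.h>

int main(void)
{
	uint8_t Pix[18*18];   /* 16x16 block plus the 1-pixel border the filter reads */
	uint8_t Out[16*16];
	int16_t Red[8*8];
	int i;

	for(i=0; i<18*18; ++i) Pix[i] = (uint8_t)(i*7);   /* arbitrary pattern */

	/* downsample: Src points at the 16x16 interior, stride is 18 */
	xvid_Filter_18x18_To_8x8_C(Red, Pix + 18 + 1, 18);

	/* upsample the reduced block back to 16x16 (BpS = 16 for 'Out') */
	xvid_Copy_Upsampled_8x8_16To8_C(Out, Red, 16);

	printf("Red[0] = %d, Out[0] = %d\n", Red[0], Out[0]);
	return 0;
}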