[XviD-devel] Decoder performance
Edouard Gomez
ed.gomez at free.fr
Sat Aug 9 02:21:34 CEST 2003
Edouard Gomez (ed.gomez at free.fr) wrote:
> I must admit i had a very bad surprise looking at XviD performance
> compared to libavcodec...
> [...]
> XviD CSP INTERNAL Buffers (that's a bit faster):
> BENCHMARKs: VC: 68,771s VO: 0,018s A: 0,000s Sys: 1,732s = 70,520s
>
> libavcodec (that's a lot faster):
> BENCHMARKs: VC: 27,239s VO: 0,014s A: 0,000s Sys: 3,127s = 30,380s
A bit better now: I removed a lot of branches in the decoder code; most of
them were in the B-frame macroblock decoding functions. It's a very small
change, but it gives good results. See:
BENCHMARKs: VC: 49,806s VO: 0,020s A: 0,000s Sys: 1,756s = 51,583s
That is a ~20s speedup on this trailer, which is terrible for codecs (it's
coded at 5 Mbps, is 1000x540 in size, and is 151,3s long).
Who's next to propose a speedup patch?
PS: I would like someone to test this with qpel (as most of the code changes
affect qpel vectors). It runs fine for me, but I would prefer that someone
double-check my patch.
--
Edouard Gomez
-------------- next part --------------
--- orig/src/decoder.c
+++ mod/src/decoder.c
@@ -266,6 +266,7 @@
uint32_t i;
uint32_t iQuant = pMB->quant;
uint8_t *pY_Cur, *pU_Cur, *pV_Cur;
+ quant_intraFuncPtr dequant = (dec->quant_type == 0) ? dequant_intra : dequant4_intra;
if (reduced_resolution) {
pY_Cur = dec->cur.y + (y_pos << 5) * stride + (x_pos << 5);
@@ -326,11 +327,7 @@
stop_prediction_timer();
start_timer();
- if (dec->quant_type == 0) {
- dequant_intra(&data[i * 64], &block[i * 64], iQuant, iDcScaler);
- } else {
- dequant4_intra(&data[i * 64], &block[i * 64], iQuant, iDcScaler);
- }
+ dequant(&data[i * 64], &block[i * 64], iQuant, iDcScaler);
stop_iquant_timer();
start_timer();
@@ -395,6 +392,8 @@
int uv_dx, uv_dy;
VECTOR mv[4]; /* local copy of mvs */
+ dequanth263_interFuncPtr dequant = (dec->quant_type == 0) ? dequant_inter : dequant4_inter;
+ const int direction = dec->alternate_vertical_scan ? 2 : 0;
if (reduced_resolution) {
pY_Cur = dec->cur.y + (y_pos << 5) * stride + (x_pos << 5);
@@ -414,8 +413,8 @@
if (pMB->mode == MODE_INTER || pMB->mode == MODE_INTER_Q) {
- uv_dx = mv[0].x / (1 + dec->quarterpel);
- uv_dy = mv[0].y / (1 + dec->quarterpel);
+ uv_dx = mv[0].x >> dec->quarterpel;
+ uv_dy = mv[0].y >> dec->quarterpel;
uv_dx = (uv_dx >> 1) + roundtab_79[uv_dx & 0x3];
uv_dy = (uv_dy >> 1) + roundtab_79[uv_dy & 0x3];
@@ -453,18 +452,12 @@
} else { /* MODE_INTER4V */
int sum;
- if(dec->quarterpel)
- sum = (mv[0].x / 2) + (mv[1].x / 2) + (mv[2].x / 2) + (mv[3].x / 2);
- else
- sum = mv[0].x + mv[1].x + mv[2].x + mv[3].x;
-
+ sum = mv[0].x + mv[1].x + mv[2].x + mv[3].x;
+ sum >>= dec->quarterpel;
uv_dx = (sum >> 3) + roundtab_76[sum & 0xf];
- if(dec->quarterpel)
- sum = (mv[0].y / 2) + (mv[1].y / 2) + (mv[2].y / 2) + (mv[3].y / 2);
- else
- sum = mv[0].y + mv[1].y + mv[2].y + mv[3].y;
-
+ sum = mv[0].y + mv[1].y + mv[2].y + mv[3].y;
+ sum >>= dec->quarterpel;
uv_dy = (sum >> 3) + roundtab_76[sum & 0xf];
start_timer();
@@ -521,10 +514,9 @@
}
for (i = 0; i < 6; i++) {
- int direction = dec->alternate_vertical_scan ? 2 : 0;
- if (cbp & (1 << (5 - i))) /* coded */
- {
+ /* coded */
+ if (cbp & (1 << (5 - i))) {
memset(&block[i * 64], 0, 64 * sizeof(int16_t)); /* clear */
start_timer();
@@ -532,11 +524,7 @@
stop_coding_timer();
start_timer();
- if (dec->quant_type == 0) {
- dequant_inter(&data[i * 64], &block[i * 64], iQuant);
- } else {
- dequant4_inter(&data[i * 64], &block[i * 64], iQuant);
- }
+ dequant(&data[i * 64], &block[i * 64], iQuant);
stop_iquant_timer();
start_timer();
@@ -609,6 +597,7 @@
uint8_t *const pY_Cur=dec->cur.y + (y_pos << 4) * stride + (x_pos << 4);
uint8_t *const pU_Cur=dec->cur.u + (y_pos << 3) * stride2 + (x_pos << 3);
uint8_t *const pV_Cur=dec->cur.v + (y_pos << 3) * stride2 + (x_pos << 3);
+ dequanth263_interFuncPtr dequant = (dec->quant_type == 0) ? dequant_inter : dequant4_inter;
pMB->mvs[0] = pMB->mvs[1] = pMB->mvs[2] = pMB->mvs[3] = pMB->amv;
@@ -648,8 +637,8 @@
for (i = 0; i < 6; i++) {
int direction = dec->alternate_vertical_scan ? 2 : 0;
- if (cbp & (1 << (5 - i))) /* coded */
- {
+ /* coded */
+ if (cbp & (1 << (5 - i))) {
memset(&block[i * 64], 0, 64 * sizeof(int16_t)); /* clear */
start_timer();
@@ -657,11 +646,7 @@
stop_coding_timer();
start_timer();
- if (dec->quant_type == 0) {
- dequant_inter(&data[i * 64], &block[i * 64], iQuant);
- } else {
- dequant4_inter(&data[i * 64], &block[i * 64], iQuant);
- }
+ dequant(&data[i * 64], &block[i * 64], iQuant);
stop_iquant_timer();
start_timer();
@@ -1136,6 +1121,8 @@
uint32_t iQuant = pMB->quant;
uint8_t *pY_Cur, *pU_Cur, *pV_Cur;
int uv_dx, uv_dy;
+ const int direction = dec->alternate_vertical_scan ? 2 : 0;
+ dequanth263_interFuncPtr dequant = (dec->quant_type == 0) ? dequant_inter : dequant4_inter;
pY_Cur = dec->cur.y + (y_pos << 4) * stride + (x_pos << 4);
pU_Cur = dec->cur.u + (y_pos << 3) * stride2 + (x_pos << 3);
@@ -1146,29 +1133,20 @@
uv_dx = pMB->mvs[0].x;
uv_dy = pMB->mvs[0].y;
- if (dec->quarterpel)
- {
- uv_dx /= 2;
- uv_dy /= 2;
- }
+ uv_dx >>= dec->quarterpel;
+ uv_dy >>= dec->quarterpel;
uv_dx = (uv_dx >> 1) + roundtab_79[uv_dx & 0x3];
uv_dy = (uv_dy >> 1) + roundtab_79[uv_dy & 0x3];
} else {
int sum;
- if(dec->quarterpel)
- sum = (pMB->mvs[0].x / 2) + (pMB->mvs[1].x / 2) + (pMB->mvs[2].x / 2) + (pMB->mvs[3].x / 2);
- else
- sum = pMB->mvs[0].x + pMB->mvs[1].x + pMB->mvs[2].x + pMB->mvs[3].x;
-
+ sum = pMB->mvs[0].x + pMB->mvs[1].x + pMB->mvs[2].x + pMB->mvs[3].x;
+ sum >>= dec->quarterpel;
uv_dx = (sum >> 3) + roundtab_76[sum & 0xf];
- if(dec->quarterpel)
- sum = (pMB->mvs[0].y / 2) + (pMB->mvs[1].y / 2) + (pMB->mvs[2].y / 2) + (pMB->mvs[3].y / 2);
- else
- sum = pMB->mvs[0].y + pMB->mvs[1].y + pMB->mvs[2].y + pMB->mvs[3].y;
-
+ sum = pMB->mvs[0].y + pMB->mvs[1].y + pMB->mvs[2].y + pMB->mvs[3].y;
+ sum >>= dec->quarterpel;
uv_dy = (sum >> 3) + roundtab_76[sum & 0xf];
}
@@ -1177,8 +1155,7 @@
interpolate16x16_quarterpel(dec->cur.y, dec->refn[ref].y, dec->qtmp.y, dec->qtmp.y + 64,
dec->qtmp.y + 128, 16*x_pos, 16*y_pos,
pMB->mvs[0].x, pMB->mvs[0].y, stride, 0);
- }
- else {
+ } else {
interpolate8x8_switch(dec->cur.y, dec->refn[ref].y, 16*x_pos, 16*y_pos,
pMB->mvs[0].x, pMB->mvs[0].y, stride, 0);
interpolate8x8_switch(dec->cur.y, dec->refn[ref].y, 16*x_pos + 8, 16*y_pos,
@@ -1196,10 +1173,9 @@
stop_comp_timer();
for (i = 0; i < 6; i++) {
- int direction = dec->alternate_vertical_scan ? 2 : 0;
- if (cbp & (1 << (5 - i))) /* coded */
- {
+ /* coded */
+ if (cbp & (1 << (5 - i))) {
memset(&block[i * 64], 0, 64 * sizeof(int16_t)); /* clear */
start_timer();
@@ -1207,11 +1183,7 @@
stop_coding_timer();
start_timer();
- if (dec->quant_type == 0) {
- dequant_inter(&data[i * 64], &block[i * 64], iQuant);
- } else {
- dequant4_inter(&data[i * 64], &block[i * 64], iQuant);
- }
+ dequant(&data[i * 64], &block[i * 64], iQuant);
stop_iquant_timer();
start_timer();
@@ -1264,6 +1236,8 @@
uint32_t i;
uint8_t *pY_Cur, *pU_Cur, *pV_Cur;
const uint32_t cbp = pMB->cbp;
+ const int direction = dec->alternate_vertical_scan ? 2 : 0;
+ dequanth263_interFuncPtr dequant = (dec->quant_type == 0) ? dequant_inter : dequant4_inter;
pY_Cur = dec->cur.y + (y_pos << 4) * stride + (x_pos << 4);
pU_Cur = dec->cur.u + (y_pos << 3) * stride2 + (x_pos << 3);
@@ -1277,14 +1251,11 @@
b_uv_dx = pMB->b_mvs[0].x;
b_uv_dy = pMB->b_mvs[0].y;
- if (dec->quarterpel)
- {
- uv_dx /= 2;
- uv_dy /= 2;
+ uv_dx >>= dec->quarterpel;
+ uv_dy >>= dec->quarterpel;
- b_uv_dx /= 2;
- b_uv_dy /= 2;
- }
+ b_uv_dx /= 2;
+ b_uv_dy /= 2;
uv_dx = (uv_dx >> 1) + roundtab_79[uv_dx & 0x3];
uv_dy = (uv_dy >> 1) + roundtab_79[uv_dy & 0x3];
@@ -1294,33 +1265,20 @@
} else {
int sum;
- if(dec->quarterpel)
- sum = (pMB->mvs[0].x / 2) + (pMB->mvs[1].x / 2) + (pMB->mvs[2].x / 2) + (pMB->mvs[3].x / 2);
- else
- sum = pMB->mvs[0].x + pMB->mvs[1].x + pMB->mvs[2].x + pMB->mvs[3].x;
-
+ sum = pMB->mvs[0].x + pMB->mvs[1].x + pMB->mvs[2].x + pMB->mvs[3].x;
+ sum >>= dec->quarterpel;
uv_dx = (sum >> 3) + roundtab_76[sum & 0xf];
- if(dec->quarterpel)
- sum = (pMB->mvs[0].y / 2) + (pMB->mvs[1].y / 2) + (pMB->mvs[2].y / 2) + (pMB->mvs[3].y / 2);
- else
- sum = pMB->mvs[0].y + pMB->mvs[1].y + pMB->mvs[2].y + pMB->mvs[3].y;
-
+ sum = pMB->mvs[0].y + pMB->mvs[1].y + pMB->mvs[2].y + pMB->mvs[3].y;
+ sum >>= dec->quarterpel;
uv_dy = (sum >> 3) + roundtab_76[sum & 0xf];
-
- if(dec->quarterpel)
- sum = (pMB->b_mvs[0].x / 2) + (pMB->b_mvs[1].x / 2) + (pMB->b_mvs[2].x / 2) + (pMB->b_mvs[3].x / 2);
- else
- sum = pMB->b_mvs[0].x + pMB->b_mvs[1].x + pMB->b_mvs[2].x + pMB->b_mvs[3].x;
-
+ sum = pMB->b_mvs[0].x + pMB->b_mvs[1].x + pMB->b_mvs[2].x + pMB->b_mvs[3].x;
+ sum >>= dec->quarterpel;
b_uv_dx = (sum >> 3) + roundtab_76[sum & 0xf];
- if(dec->quarterpel)
- sum = (pMB->b_mvs[0].y / 2) + (pMB->b_mvs[1].y / 2) + (pMB->b_mvs[2].y / 2) + (pMB->b_mvs[3].y / 2);
- else
- sum = pMB->b_mvs[0].y + pMB->b_mvs[1].y + pMB->b_mvs[2].y + pMB->b_mvs[3].y;
-
+ sum = pMB->b_mvs[0].y + pMB->b_mvs[1].y + pMB->b_mvs[2].y + pMB->b_mvs[3].y;
+ sum >>= dec->quarterpel;
b_uv_dy = (sum >> 3) + roundtab_76[sum & 0xf];
}
@@ -1436,8 +1394,6 @@
stop_comp_timer();
for (i = 0; i < 6; i++) {
- int direction = dec->alternate_vertical_scan ? 2 : 0;
-
if (cbp & (1 << (5 - i))) /* coded */
{
memset(&block[i * 64], 0, 64 * sizeof(int16_t)); /* clear */
@@ -1447,11 +1403,7 @@
stop_coding_timer();
start_timer();
- if (dec->quant_type == 0) {
- dequant_inter(&data[i * 64], &block[i * 64], iQuant);
- } else {
- dequant4_inter(&data[i * 64], &block[i * 64], iQuant);
- }
+ dequant(&data[i * 64], &block[i * 64], iQuant);
stop_iquant_timer();
start_timer();
More information about the XviD-devel
mailing list