[XviD-devel] Decoder performance

Edouard Gomez ed.gomez at free.fr
Sat Aug 9 02:21:34 CEST 2003


Edouard Gomez (ed.gomez at free.fr) wrote:
> I must admit I had a very bad surprise looking at XviD performance
> compared to libavcodec...
> [...]
> XviD CSP INTERNAL Buffers (that's a bit faster):
> BENCHMARKs: VC: 68,771s VO: 0,018s A: 0,000s Sys: 1,732s = 70,520s
> 
> libavcodec (that's a lot faster):
> BENCHMARKs: VC: 27,239s VO: 0,014s A: 0,000s Sys: 3,127s = 30,380s

It's a bit better now: I removed a lot of branches in the decoder code,
most of which were in the B-frame macroblock decoding functions. It's a
very small change, but it gives good results. See:

BENCHMARKs: VC: 49,806s VO: 0,020s A: 0,000s Sys: 1,756s = 51,583s

That's a ~20s speedup on this trailer, which is terrible for codecs (it's
coded at 5 Mbps, is 1000x540 in size, and is 151,3s long).

Who's next to propose a speedup patch?

PS: I would like someone to test this with qpel (as most of the code
changes affect qpel vectors). It runs fine for me, but I would prefer
that someone double-check my patch.

-- 
Edouard Gomez
-------------- next part --------------
--- orig/src/decoder.c
+++ mod/src/decoder.c
@@ -266,6 +266,7 @@
 	uint32_t i;
 	uint32_t iQuant = pMB->quant;
 	uint8_t *pY_Cur, *pU_Cur, *pV_Cur;
+	quant_intraFuncPtr dequant = (dec->quant_type == 0) ? dequant_intra : dequant4_intra;
 
 	if (reduced_resolution) {
 		pY_Cur = dec->cur.y + (y_pos << 5) * stride + (x_pos << 5);
@@ -326,11 +327,7 @@
 		stop_prediction_timer();
 
 		start_timer();
-		if (dec->quant_type == 0) {
-			dequant_intra(&data[i * 64], &block[i * 64], iQuant, iDcScaler);
-		} else {
-			dequant4_intra(&data[i * 64], &block[i * 64], iQuant, iDcScaler);
-		}
+		dequant(&data[i * 64], &block[i * 64], iQuant, iDcScaler);
 		stop_iquant_timer();
 
 		start_timer();
@@ -395,6 +392,8 @@
 
 	int uv_dx, uv_dy;
 	VECTOR mv[4];	/* local copy of mvs */
+	dequanth263_interFuncPtr dequant = (dec->quant_type == 0) ? dequant_inter : dequant4_inter;
+	const int direction = dec->alternate_vertical_scan ? 2 : 0;
 
 	if (reduced_resolution) {
 		pY_Cur = dec->cur.y + (y_pos << 5) * stride + (x_pos << 5);
@@ -414,8 +413,8 @@
 	
 	if (pMB->mode == MODE_INTER || pMB->mode == MODE_INTER_Q) {
 
-		uv_dx = mv[0].x / (1 + dec->quarterpel);
-		uv_dy = mv[0].y / (1 + dec->quarterpel);
+		uv_dx = mv[0].x >> dec->quarterpel;
+		uv_dy = mv[0].y >> dec->quarterpel;
 		
 		uv_dx = (uv_dx >> 1) + roundtab_79[uv_dx & 0x3];
 		uv_dy = (uv_dy >> 1) + roundtab_79[uv_dy & 0x3];
@@ -453,18 +452,12 @@
 	} else {	/* MODE_INTER4V */
 		int sum;
 		
-		if(dec->quarterpel)
-			sum = (mv[0].x / 2) + (mv[1].x / 2) + (mv[2].x / 2) + (mv[3].x / 2);
-		else
-			sum = mv[0].x + mv[1].x + mv[2].x + mv[3].x;
-
+		sum = mv[0].x + mv[1].x + mv[2].x + mv[3].x;
+		sum >>= dec->quarterpel;
 		uv_dx = (sum >> 3) + roundtab_76[sum & 0xf];
 
-		if(dec->quarterpel)
-			sum = (mv[0].y / 2) + (mv[1].y / 2) + (mv[2].y / 2) + (mv[3].y / 2);
-		else
-			sum = mv[0].y + mv[1].y + mv[2].y + mv[3].y;
-
+		sum = mv[0].y + mv[1].y + mv[2].y + mv[3].y;
+		sum >>= dec->quarterpel;
 		uv_dy = (sum >> 3) + roundtab_76[sum & 0xf];
 
 		start_timer();
@@ -521,10 +514,9 @@
 	}
 
 	for (i = 0; i < 6; i++) {
-		int direction = dec->alternate_vertical_scan ? 2 : 0;
 
-		if (cbp & (1 << (5 - i)))	/* coded */
-		{
+		/* coded */
+		if (cbp & (1 << (5 - i))) {
 			memset(&block[i * 64], 0, 64 * sizeof(int16_t));	/* clear */
 
 			start_timer();
@@ -532,11 +524,7 @@
 			stop_coding_timer();
 
 			start_timer();
-			if (dec->quant_type == 0) {
-				dequant_inter(&data[i * 64], &block[i * 64], iQuant);
-			} else {
-				dequant4_inter(&data[i * 64], &block[i * 64], iQuant);
-			}
+			dequant(&data[i * 64], &block[i * 64], iQuant);
 			stop_iquant_timer();
 
 			start_timer();
@@ -609,6 +597,7 @@
 	uint8_t *const pY_Cur=dec->cur.y + (y_pos << 4) * stride + (x_pos << 4);
 	uint8_t *const pU_Cur=dec->cur.u + (y_pos << 3) * stride2 + (x_pos << 3);
 	uint8_t *const pV_Cur=dec->cur.v + (y_pos << 3) * stride2 + (x_pos << 3);
+	dequanth263_interFuncPtr dequant = (dec->quant_type == 0) ? dequant_inter : dequant4_inter;
 
 	pMB->mvs[0] = pMB->mvs[1] = pMB->mvs[2] = pMB->mvs[3] = pMB->amv;
 
@@ -648,8 +637,8 @@
 	for (i = 0; i < 6; i++) {
 		int direction = dec->alternate_vertical_scan ? 2 : 0;
 
-		if (cbp & (1 << (5 - i)))	/* coded */
-		{
+		/* coded */
+		if (cbp & (1 << (5 - i))) {
 			memset(&block[i * 64], 0, 64 * sizeof(int16_t));	/* clear */
 
 			start_timer();
@@ -657,11 +646,7 @@
 			stop_coding_timer();
 
 			start_timer();
-			if (dec->quant_type == 0) {
-				dequant_inter(&data[i * 64], &block[i * 64], iQuant);
-			} else {
-				dequant4_inter(&data[i * 64], &block[i * 64], iQuant);
-			}
+			dequant(&data[i * 64], &block[i * 64], iQuant);
 			stop_iquant_timer();
 
 			start_timer();
@@ -1136,6 +1121,8 @@
 	uint32_t iQuant = pMB->quant;
 	uint8_t *pY_Cur, *pU_Cur, *pV_Cur;
 	int uv_dx, uv_dy;
+	const int direction = dec->alternate_vertical_scan ? 2 : 0;
+	dequanth263_interFuncPtr dequant = (dec->quant_type == 0) ? dequant_inter : dequant4_inter;
 
 	pY_Cur = dec->cur.y + (y_pos << 4) * stride + (x_pos << 4);
 	pU_Cur = dec->cur.u + (y_pos << 3) * stride2 + (x_pos << 3);
@@ -1146,29 +1133,20 @@
 		uv_dx = pMB->mvs[0].x;
 		uv_dy = pMB->mvs[0].y;
 
-		if (dec->quarterpel)
-		{
-			uv_dx /= 2;
-			uv_dy /= 2;
-		}
+		uv_dx >>= dec->quarterpel;
+		uv_dy >>= dec->quarterpel;
 
 		uv_dx = (uv_dx >> 1) + roundtab_79[uv_dx & 0x3];
 		uv_dy = (uv_dy >> 1) + roundtab_79[uv_dy & 0x3];
 	} else {
 		int sum;
 
-		if(dec->quarterpel)
-			sum = (pMB->mvs[0].x / 2) + (pMB->mvs[1].x / 2) + (pMB->mvs[2].x / 2) + (pMB->mvs[3].x / 2);
-		else
-			sum = pMB->mvs[0].x + pMB->mvs[1].x + pMB->mvs[2].x + pMB->mvs[3].x;
-
+		sum = pMB->mvs[0].x + pMB->mvs[1].x + pMB->mvs[2].x + pMB->mvs[3].x;
+		sum >>= dec->quarterpel;
 		uv_dx = (sum >> 3) + roundtab_76[sum & 0xf];
 
-		if(dec->quarterpel)
-			sum = (pMB->mvs[0].y / 2) + (pMB->mvs[1].y / 2) + (pMB->mvs[2].y / 2) + (pMB->mvs[3].y / 2);
-		else
-			sum = pMB->mvs[0].y + pMB->mvs[1].y + pMB->mvs[2].y + pMB->mvs[3].y;
-
+		sum = pMB->mvs[0].y + pMB->mvs[1].y + pMB->mvs[2].y + pMB->mvs[3].y;
+		sum >>= dec->quarterpel;
 		uv_dy = (sum >> 3) + roundtab_76[sum & 0xf];
 	}
 
@@ -1177,8 +1155,7 @@
 		interpolate16x16_quarterpel(dec->cur.y, dec->refn[ref].y, dec->qtmp.y, dec->qtmp.y + 64,
  								    dec->qtmp.y + 128, 16*x_pos, 16*y_pos,
 								    pMB->mvs[0].x, pMB->mvs[0].y, stride, 0);
-	}
-	else {
+	} else {
 		interpolate8x8_switch(dec->cur.y, dec->refn[ref].y, 16*x_pos, 16*y_pos,
 							  pMB->mvs[0].x, pMB->mvs[0].y, stride, 0);
 		interpolate8x8_switch(dec->cur.y, dec->refn[ref].y, 16*x_pos + 8, 16*y_pos,
@@ -1196,10 +1173,9 @@
 	stop_comp_timer();
 
 	for (i = 0; i < 6; i++) {
-		int direction = dec->alternate_vertical_scan ? 2 : 0;
 
-		if (cbp & (1 << (5 - i)))	/* coded */
-		{
+		/* coded */
+		if (cbp & (1 << (5 - i))) {
 			memset(&block[i * 64], 0, 64 * sizeof(int16_t));	/* clear */
 
 			start_timer();
@@ -1207,11 +1183,7 @@
 			stop_coding_timer();
 
 			start_timer();
-			if (dec->quant_type == 0) {
-				dequant_inter(&data[i * 64], &block[i * 64], iQuant);
-			} else {
-				dequant4_inter(&data[i * 64], &block[i * 64], iQuant);
-			}
+			dequant(&data[i * 64], &block[i * 64], iQuant);
 			stop_iquant_timer();
 
 			start_timer();
@@ -1264,6 +1236,8 @@
 	uint32_t i;
 	uint8_t *pY_Cur, *pU_Cur, *pV_Cur;
     const uint32_t cbp = pMB->cbp;
+	const int direction = dec->alternate_vertical_scan ? 2 : 0;
+	dequanth263_interFuncPtr dequant = (dec->quant_type == 0) ? dequant_inter : dequant4_inter;
 
 	pY_Cur = dec->cur.y + (y_pos << 4) * stride + (x_pos << 4);
 	pU_Cur = dec->cur.u + (y_pos << 3) * stride2 + (x_pos << 3);
@@ -1277,14 +1251,11 @@
 		b_uv_dx = pMB->b_mvs[0].x;
 		b_uv_dy = pMB->b_mvs[0].y;
 
-		if (dec->quarterpel)
-		{
-			uv_dx /= 2;
-			uv_dy /= 2;
+		uv_dx >>= dec->quarterpel;
+		uv_dy >>= dec->quarterpel;
 
-			b_uv_dx /= 2;
-			b_uv_dy /= 2;
-		}
+		b_uv_dx /= 2;
+		b_uv_dy /= 2;
 
 		uv_dx = (uv_dx >> 1) + roundtab_79[uv_dx & 0x3];
 		uv_dy = (uv_dy >> 1) + roundtab_79[uv_dy & 0x3];
@@ -1294,33 +1265,20 @@
 	} else {
 		int sum;
 
-		if(dec->quarterpel)
-			sum = (pMB->mvs[0].x / 2) + (pMB->mvs[1].x / 2) + (pMB->mvs[2].x / 2) + (pMB->mvs[3].x / 2);
-		else
-			sum = pMB->mvs[0].x + pMB->mvs[1].x + pMB->mvs[2].x + pMB->mvs[3].x;
-
+		sum = pMB->mvs[0].x + pMB->mvs[1].x + pMB->mvs[2].x + pMB->mvs[3].x;
+		sum >>= dec->quarterpel;
 		uv_dx = (sum >> 3) + roundtab_76[sum & 0xf];
 
-		if(dec->quarterpel)
-			sum = (pMB->mvs[0].y / 2) + (pMB->mvs[1].y / 2) + (pMB->mvs[2].y / 2) + (pMB->mvs[3].y / 2);
-		else
-			sum = pMB->mvs[0].y + pMB->mvs[1].y + pMB->mvs[2].y + pMB->mvs[3].y;
-
+		sum = pMB->mvs[0].y + pMB->mvs[1].y + pMB->mvs[2].y + pMB->mvs[3].y;
+		sum >>= dec->quarterpel;
 		uv_dy = (sum >> 3) + roundtab_76[sum & 0xf];
 
-
-		if(dec->quarterpel)
-			sum = (pMB->b_mvs[0].x / 2) + (pMB->b_mvs[1].x / 2) + (pMB->b_mvs[2].x / 2) + (pMB->b_mvs[3].x / 2);
-		else
-			sum = pMB->b_mvs[0].x + pMB->b_mvs[1].x + pMB->b_mvs[2].x + pMB->b_mvs[3].x;
-
+		sum = pMB->b_mvs[0].x + pMB->b_mvs[1].x + pMB->b_mvs[2].x + pMB->b_mvs[3].x;
+		sum >>= dec->quarterpel;
 		b_uv_dx = (sum >> 3) + roundtab_76[sum & 0xf];
 
-		if(dec->quarterpel)
-			sum = (pMB->b_mvs[0].y / 2) + (pMB->b_mvs[1].y / 2) + (pMB->b_mvs[2].y / 2) + (pMB->b_mvs[3].y / 2);
-		else
-			sum = pMB->b_mvs[0].y + pMB->b_mvs[1].y + pMB->b_mvs[2].y + pMB->b_mvs[3].y;
-
+		sum = pMB->b_mvs[0].y + pMB->b_mvs[1].y + pMB->b_mvs[2].y + pMB->b_mvs[3].y;
+		sum >>= dec->quarterpel;
 		b_uv_dy = (sum >> 3) + roundtab_76[sum & 0xf];
 	}
 
@@ -1436,8 +1394,6 @@
 	stop_comp_timer();
 
 	for (i = 0; i < 6; i++) {
-		int direction = dec->alternate_vertical_scan ? 2 : 0;
-
 		if (cbp & (1 << (5 - i)))	/* coded */
 		{
 			memset(&block[i * 64], 0, 64 * sizeof(int16_t));	/* clear */
@@ -1447,11 +1403,7 @@
 			stop_coding_timer();
 
 			start_timer();
-			if (dec->quant_type == 0) {
-				dequant_inter(&data[i * 64], &block[i * 64], iQuant);
-			} else {
-				dequant4_inter(&data[i * 64], &block[i * 64], iQuant);
-			}
+			dequant(&data[i * 64], &block[i * 64], iQuant);
 			stop_iquant_timer();
 
 			start_timer();


More information about the XviD-devel mailing list