diff --git a/common/common.c b/common/common.c index 71a29b1..301b9ed 100644 --- a/common/common.c +++ b/common/common.c @@ -117,6 +117,7 @@ void x264_param_default( x264_param_t *param ) | X264_ANALYSE_PSUB16x16 | X264_ANALYSE_BSUB16x16; param->analyse.i_direct_mv_pred = X264_DIRECT_PRED_SPATIAL; param->analyse.i_me_method = X264_ME_HEX; + param->analyse.f_psy_rd = 1.0; param->analyse.i_me_range = 16; param->analyse.i_subpel_refine = 5; param->analyse.b_chroma_me = 1; @@ -464,6 +465,8 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value ) p->analyse.i_mv_range_thread = atoi(value); OPT2("subme", "subq") p->analyse.i_subpel_refine = atoi(value); + OPT("psy-rd") + p->analyse.f_psy_rd = atof(value); OPT("bime") p->analyse.b_bidir_me = atobool(value); OPT("chroma-me") @@ -856,6 +859,7 @@ char *x264_param2string( x264_param_t *p, int b_res ) s += sprintf( s, " analyse=%#x:%#x", p->analyse.intra, p->analyse.inter ); s += sprintf( s, " me=%s", x264_motion_est_names[ p->analyse.i_me_method ] ); s += sprintf( s, " subme=%d", p->analyse.i_subpel_refine ); + s += sprintf( s, " psy_rd=%f", p->analyse.f_psy_rd ); s += sprintf( s, " brdo=%d", p->analyse.b_bframe_rdo ); s += sprintf( s, " mixed_ref=%d", p->analyse.b_mixed_references ); s += sprintf( s, " me_range=%d", p->analyse.i_me_range ); diff --git a/common/common.h b/common/common.h index e2792cc..fbd88fd 100644 --- a/common/common.h +++ b/common/common.h @@ -454,6 +454,12 @@ struct x264_t DECLARE_ALIGNED_16( int16_t i8x8_dct_buf[3][64] ); DECLARE_ALIGNED_16( int16_t i4x4_dct_buf[15][16] ); + /* Cached SATD (per 4x4) and SA8D (per 8x8) scores of the source block, plus their sums, for psy RD */ + int fenc_satd[4][4]; + int fenc_satd_sum; + int fenc_sa8d[2][2]; + int fenc_sa8d_sum; + /* pointer over mb of the frame to be compressed */ uint8_t *p_fenc[3]; diff --git a/encoder/analyse.c b/encoder/analyse.c index 270b90a..25346b4 100644 --- a/encoder/analyse.c +++ b/encoder/analyse.c @@ -467,6 +467,34 @@ static void predict_4x4_mode_available( unsigned int i_neighbour, } } 
+/* Cache, for psy RD, the SATD of each 4x4 block and the SA8D of each 8x8 block of plane 0 of the source macroblock (measured against an all-zero block), with a SAD-derived DC estimate subtracted, along with their sums. Does nothing when psy RD is disabled (i_psy_rd == 0). */ +static inline void x264_mb_cache_fenc_satd( x264_t *h ) +{ + DECLARE_ALIGNED_16(uint8_t zero[16]) = {0}; + uint8_t *fenc; + int x, y, satd_sum = 0, sa8d_sum = 0; + if( !h->param.analyse.i_psy_rd) + return; + for( y = 0; y < 4; y++ ) + for( x = 0; x < 4; x++ ) + { + fenc = h->mb.pic.p_fenc[0]+x*4+y*4*FENC_STRIDE; + h->mb.pic.fenc_satd[y][x] = h->pixf.satd[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE ) + - (h->pixf.sad[PIXEL_4x4]( zero, 0, fenc, FENC_STRIDE )>>1); + satd_sum += h->mb.pic.fenc_satd[y][x]; + } + for( y = 0; y < 2; y++ ) + for( x = 0; x < 2; x++ ) + { + fenc = h->mb.pic.p_fenc[0]+x*8+y*8*FENC_STRIDE; + h->mb.pic.fenc_sa8d[y][x] = h->pixf.sa8d[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE ) + - (h->pixf.sad[PIXEL_8x8]( zero, 0, fenc, FENC_STRIDE )>>2); + sa8d_sum += h->mb.pic.fenc_sa8d[y][x]; + } + h->mb.pic.fenc_satd_sum = satd_sum; + h->mb.pic.fenc_sa8d_sum = sa8d_sum; +} + static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a ) { int i; @@ -1016,12 +1044,15 @@ static void x264_mb_analyse_inter_p16x16( x264_t *h, x264_mb_analysis_t *a ) assert( a->l0.me16x16.mv[1] <= h->mb.mv_max_spel[1] || h->param.i_threads == 1 ); h->mb.i_type = P_L0; - if( a->b_mbrd && a->l0.me16x16.i_ref == 0 - && *(uint32_t*)a->l0.me16x16.mv == *(uint32_t*)h->mb.cache.pskip_mv ) + if( a->b_mbrd ) { - h->mb.i_partition = D_16x16; - x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv ); - a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 ); + x264_mb_cache_fenc_satd( h ); + if( a->l0.me16x16.i_ref == 0 && *(uint32_t*)a->l0.me16x16.mv == *(uint32_t*)h->mb.cache.pskip_mv ) + { + h->mb.i_partition = D_16x16; + x264_macroblock_cache_mv_ptr( h, 0, 0, 4, 4, 0, a->l0.me16x16.mv ); + a->l0.i_rd16x16 = x264_rd_cost_mb( h, a->i_lambda2 ); + } } } @@ -1906,7 +1937,7 @@ static void x264_mb_analyse_p_rd( x264_t *h, x264_mb_analysis_t *a, int i_satd ) static void x264_mb_analyse_b_rd( x264_t *h, 
x264_mb_analysis_t *a, int i_satd_inter ) { - int thresh = i_satd_inter * 17/16; + int thresh = i_satd_inter * (17 + (!!h->param.analyse.i_psy_rd))/16; if( a->b_direct_available && a->i_rd16x16direct == COST_MAX ) { @@ -2067,7 +2098,10 @@ void x264_macroblock_analyse( x264_t *h ) { x264_mb_analyse_intra( h, &analysis, COST_MAX ); if( analysis.b_mbrd ) + { + x264_mb_cache_fenc_satd( h ); x264_intra_rd( h, &analysis, COST_MAX ); + } i_cost = analysis.i_satd_i16x16; h->mb.i_type = I_16x16; @@ -2342,6 +2376,9 @@ void x264_macroblock_analyse( x264_t *h ) { int i_bskip_cost = COST_MAX; int b_skip = 0; + + if( analysis.b_mbrd ) + x264_mb_cache_fenc_satd( h ); h->mb.i_type = B_SKIP; if( h->mb.b_direct_auto_write ) diff --git a/encoder/encoder.c b/encoder/encoder.c index 2c2fe8c..ff5febe 100644 --- a/encoder/encoder.c +++ b/encoder/encoder.c @@ -411,5 +411,6 @@ static int x264_validate_parameters( x264_t *h ) h->param.analyse.b_fast_pskip = 0; h->param.analyse.i_noise_reduction = 0; + h->param.analyse.f_psy_rd = 0; } if( h->param.rc.i_rc_method == X264_RC_CQP ) { @@ -486,6 +487,16 @@ static int x264_validate_parameters( x264_t *h ) if( !h->param.b_cabac ) h->param.analyse.i_trellis = 0; h->param.analyse.i_trellis = x264_clip3( h->param.analyse.i_trellis, 0, 2 ); + h->param.analyse.f_psy_rd = x264_clip3f( h->param.analyse.f_psy_rd, 0, 10 ); + if( h->param.analyse.i_subpel_refine < 6) + h->param.analyse.f_psy_rd = 0; + if( h->param.analyse.f_psy_rd && h->param.analyse.i_trellis == 1 ) + { + x264_log( h, X264_LOG_WARNING, "psy RD is not compatible with trellis=1; use 0 or 2.\n" ); + h->param.analyse.i_trellis = 0; + } + /* Always refresh the fixed-point strength: the analysis code tests i_psy_rd, so a stale non-zero value must not survive when f_psy_rd has been forced to 0 above. */ + h->param.analyse.i_psy_rd = h->param.analyse.f_psy_rd ? FIX8( h->param.analyse.f_psy_rd ) : 0; h->param.rc.i_aq_mode = x264_clip3( h->param.rc.i_aq_mode, 0, 2 ); if( h->param.rc.f_aq_strength <= 0 ) h->param.rc.i_aq_mode = 0; diff --git a/encoder/rdo.c b/encoder/rdo.c index 76bf57b..7da862a 100644 --- a/encoder/rdo.c +++ b/encoder/rdo.c @@ -50,21 +50,82 @@ 
static uint16_t cabac_prefix_size[15][128]; #define COPY_CABAC h->mc.memcpy_aligned( &cabac_tmp.f8_bits_encoded, &h->cabac.f8_bits_encoded, \ sizeof(x264_cabac_t) - offsetof(x264_cabac_t,f8_bits_encoded) ) - -static int ssd_mb( x264_t *h ) +/* Add to the caller's local `satd` the absolute difference between the reconstruction's score (against an all-zero block, minus dc_coef) and the cached source sum. Expects h, zero, fdec, dc_coef, x and y to be in scope where it is expanded. */ +#define ADD_ABS_SATD(satdtype, pixel)\ + satd += abs((h->pixf.satdtype[pixel]( zero, 0, fdec, FDEC_STRIDE ) - dc_coef)\ + - sum_##satdtype( h, pixel, x, y )); + +/* Sum the cached source SATD scores of the 4x4 blocks covered by the given block size at pixel offset (x,y), to avoid recomputing them. */ +static inline int sum_satd( x264_t *h, int pixel, int x, int y ) +{ + int satd = 0; + int min_x = x>>2; + int min_y = y>>2; + int max_x = (x>>2) + (x264_pixel_size[pixel].w>>2); + int max_y = (y>>2) + (x264_pixel_size[pixel].h>>2); + if( pixel == PIXEL_16x16 ) + return h->mb.pic.fenc_satd_sum; + for( y = min_y; y < max_y; y++ ) + for( x = min_x; x < max_x; x++ ) + satd += h->mb.pic.fenc_satd[y][x]; + return satd; +} +/* Same as sum_satd, but sums the cached SA8D scores of the covered 8x8 blocks. */ +static inline int sum_sa8d( x264_t *h, int pixel, int x, int y ) +{ + int sa8d = 0; + int min_x = x>>3; + int min_y = y>>3; + int max_x = (x>>3) + (x264_pixel_size[pixel].w>>3); + int max_y = (y>>3) + (x264_pixel_size[pixel].h>>3); + if( pixel == PIXEL_16x16 ) + return h->mb.pic.fenc_sa8d_sum; + for( y = min_y; y < max_y; y++ ) + for( x = min_x; x < max_x; x++ ) + sa8d += h->mb.pic.fenc_sa8d[y][x]; + return sa8d; +} + +/* Psy RD distortion metric: SSD plus "Absolute Difference of Complexities" */ +/* SATD and SA8D are used to measure block complexity. */ +/* Blocks with a complexity most similar to that of the source are scored best. */ +/* Both the SATD and the SA8D differences are used, to avoid bias from the DCT size. Using SATD */ +/* only, for example, results in overuse of 8x8dct, while the opposite occurs when using SA8D. */ +/* This is because frequencies stored in an 8x8dct sum up to a larger value when viewed through a 4x4 */ +/* transform and vice versa with a 4x4dct and an 8x8 transform. 
*/ +/* SSD is still used as the primary RD metric; this value is merely added to it for psy purposes. */ + +/* FIXME: Is there a better metric than averaged SATD/SA8D difference for complexity difference? */ +/* Hadamard transform is recursive, so a SATD+SA8D can be done faster by taking advantage of this fact. */ +/* This optimization can also be used in non-RD transform decision. */ + +static inline int ssd_plane( x264_t *h, int size, int p, int x, int y ) { - return h->pixf.ssd[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, - h->mb.pic.p_fdec[0], FDEC_STRIDE ) - + h->pixf.ssd[PIXEL_8x8]( h->mb.pic.p_fenc[1], FENC_STRIDE, - h->mb.pic.p_fdec[1], FDEC_STRIDE ) - + h->pixf.ssd[PIXEL_8x8]( h->mb.pic.p_fenc[2], FENC_STRIDE, - h->mb.pic.p_fdec[2], FDEC_STRIDE ); + DECLARE_ALIGNED_16(uint8_t zero[16]) = {0}; + int satd = 0; + uint8_t *fdec = h->mb.pic.p_fdec[p] + x + y*FDEC_STRIDE; + uint8_t *fenc = h->mb.pic.p_fenc[p] + x + y*FENC_STRIDE; + if( p == 0 && h->param.analyse.i_psy_rd ) + { + int dc_coef = h->pixf.sad[size](zero, 0, fdec, FDEC_STRIDE) >> 1; + ADD_ABS_SATD(satd, size); + /* If the block is smaller than 8x8, we can't do an SA8D; this probably isn't a big problem. 
*/ + if(size <= PIXEL_8x8) + { + dc_coef >>= 1; + ADD_ABS_SATD(sa8d, size); + satd >>= 1; + } + satd = (satd * h->param.analyse.i_psy_rd * x264_lambda_tab[h->mb.i_qp] + 128) >> 8; + } + return h->pixf.ssd[size](fenc, FENC_STRIDE, fdec, FDEC_STRIDE) + satd; } -static int ssd_plane( x264_t *h, int size, int p, int x, int y ) +static inline int ssd_mb( x264_t *h ) { - return h->pixf.ssd[size]( h->mb.pic.p_fenc[p] + x+y*FENC_STRIDE, FENC_STRIDE, - h->mb.pic.p_fdec[p] + x+y*FDEC_STRIDE, FDEC_STRIDE ); + return ssd_plane(h, PIXEL_16x16, 0, 0, 0) + + ssd_plane(h, PIXEL_8x8, 1, 0, 0) + + ssd_plane(h, PIXEL_8x8, 2, 0, 0); } static int x264_rd_cost_mb( x264_t *h, int i_lambda2 ) diff --git a/x264.c b/x264.c index 14466e5..96326cd 100644 --- a/x264.c +++ b/x264.c @@ -243,6 +243,9 @@ static void Help( x264_param_t *defaults, int b_longhelp ) H0( " -m, --subme Subpixel motion estimation and partition\n" " decision quality: 1=fast, 7=best. [%d]\n", defaults->analyse.i_subpel_refine ); H0( " --b-rdo RD based mode decision for B-frames. 
Requires subme 6.\n" ); + H0( " --psy-rd Strength of mode decision psychovisual optimization [\"%f\"]\n" + " Does nothing at subme < 6.\n", + defaults->analyse.f_psy_rd ); H0( " --mixed-refs Decide references on a per partition basis\n" ); H1( " --no-chroma-me Ignore chroma in motion estimation\n" ); H1( " --bime Jointly optimize both MVs in B-frames\n" ); @@ -411,6 +414,7 @@ static int Parse( int argc, char **argv, { "mvrange", required_argument, NULL, 0 }, { "mvrange-thread", required_argument, NULL, 0 }, { "subme", required_argument, NULL, 'm' }, + { "psy-rd", required_argument, NULL, 0 }, { "b-rdo", no_argument, NULL, 0 }, { "mixed-refs", no_argument, NULL, 0 }, { "no-chroma-me", no_argument, NULL, 0 }, diff --git a/x264.h b/x264.h index 3b678dc..02266c1 100644 --- a/x264.h +++ b/x264.h @@ -239,6 +239,8 @@ typedef struct x264_param_t int b_fast_pskip; /* early SKIP detection on P-frames */ int b_dct_decimate; /* transform coefficient thresholding on P-frames */ int i_noise_reduction; /* adaptive pseudo-deadzone */ + float f_psy_rd; /* Psy RD strength; 0 disables psy RD */ + int i_psy_rd; /* Psy RD strength as a fixed-point value derived from f_psy_rd */ /* the deadzone size that will be used in luma quantization */ int i_luma_deadzone[2]; /* {inter, intra} */