Also thanks for reviewing On Fri, Dec 2, 2022 at 10:34 PM Caleb Etemesi <etemesica...@gmail.com> wrote:
> Hi > > For bit-reading, > 1. The spec has some chunks read from back to front,I didn't see that > functionality present in get_bits.h(which I assumed contained the bit > reading API). > 2. It doesn't handle unstuffing correctly. > 3. Doesn't handle EOB correctly, the spec has some arrays when there are > no more bytes, the bit-reader should be filled with 0, others with 0xFF.. > > > >> These divisibility checks look like they could be even simpler given > that q always increases by 2 > > How so?(I might be failing to see something common here.) > > >> Could this be done all in one go in the loop above? Writing to mu then > reading from it later is not very cache friendly. > > Open to exploring that path, but to me it seems to add more complexity > than the speed up it may provide. > > Also I do believe it's really not that cache unfriendly, access is linear > on the second write and highly predictable. > > > On Fri, Dec 2, 2022 at 9:46 PM Tomas Härdin <g...@haerdin.se> wrote: > >> fre 2022-12-02 klockan 21:11 +0300 skrev etemesica...@gmail.com: >> > >> > +/** >> > + * Given a precomputed c, checks whether n % d == 0 >> > + */ >> > +static av_always_inline uint32_t is_divisible(uint32_t n, uint64_t >> > c) >> > +{ >> > + return n * c <= c - 1; >> > +} >> >> This looks like something that could go in lavu, say intmath.h >> >> > + >> > +/** >> > + * @brief Refill the buffer backwards in little Endian while >> > skipping >> > + * over stuffing bits >> > + * >> > + * Stuffing bits are those that appear in the position of any byte >> > whose >> > + * LSBs are all 1's if the last consumed byte was larger than 0x8F >> > + */ >> > +static int jpeg2000_bitbuf_refill_backwards(StateVars *buffer, >> > + const uint8_t *array) >> >> Don't we already have sufficient bitreading capabilities already? Maybe >> it doesn't do the necessary unstuffing.. >> >> >> > + /* >> > + * As an optimization, we can replace modulo operations with >> > + * checking if a number is divisible , since that's the only >> > thing we need. >> > + * this is paired with is_divisible. >> > + * Credits to Daniel Lemire blog post: >> > + * >> > >> https://lemire.me/blog/2019/02/08/faster-remainders-when-the-divisor-is-a-constant-beating-compilers-and-libdivide/ >> > + * It's UB on zero, but we can't have a quad being zero, the >> > spec doesn't allow, >> > + * so we error out early in case that's the case. >> > + */ >> > + >> > + c = 1 + UINT64_C(0xffffffffffffffff) / quad_width; >> >> Just 0xffffffffffffffffULL would work >> >> > + >> > + for (int row = 1; row < quad_height; row++) { >> > + while ((q - (row * quad_width)) < quad_width - 1 && q < >> > (quad_height * quad_width)) { >> > + q1 = q; >> > + q2 = q + 1; >> > + context1 = sigma_n[4 * (q1 - quad_width) + 1]; >> > + context1 += sigma_n[4 * (q1 - quad_width) + 3] << 2; // >> > ne >> > + >> > + if (!is_divisible(q1, c)) { >> >> These divisibility checks look like they could be even simpler given >> that q always increases by 2 >> >> > + context1 |= sigma_n[4 * (q1 - quad_width) - >> > 1]; // nw >> > + context1 += (sigma_n[4 * q1 - 1] | sigma_n[4 * q1 - >> > 2]) << 1; // sw| q >> > + } >> > + if (!is_divisible(q1 + 1, c)) >> > + context1 |= sigma_n[4 * (q1 - quad_width) + 5] << 2; >> > + >> > + if ((ret = jpeg2000_decode_sig_emb(s, mel_state, >> > mel_stream, vlc_stream, >> > + dec_CxtVLC_table1, >> > Dcup, sig_pat, res_off, >> > + emb_pat_k, emb_pat_1, >> > J2K_Q1, context1, Lcup, >> > + Pcup)) >> > + < 0) >> > + goto free; >> > + >> > + for (int i = 0; i < 4; i++) >> > + sigma_n[4 * q1 + i] = (sig_pat[J2K_Q1] >> i) & 1; >> > + >> > + context2 = sigma_n[4 * (q2 - quad_width) + 1]; >> > + context2 += sigma_n[4 * (q2 - quad_width) + 3] << 2; >> > + >> > + if (!is_divisible(q2, c)) { >> > + context2 |= sigma_n[4 * (q2 - quad_width) - 1]; >> > + context2 += (sigma_n[4 * q2 - 1] | sigma_n[4 * q2 - >> > 2]) << 1; >> > + } >> > + if (!is_divisible(q2 + 1, c)) >> > + context2 |= sigma_n[4 * (q2 - quad_width) + 5] << 2; >> > + >> > + if ((ret = jpeg2000_decode_sig_emb(s, mel_state, >> > mel_stream, vlc_stream, >> > + dec_CxtVLC_table1, >> > Dcup, sig_pat, res_off, >> > + emb_pat_k, emb_pat_1, >> > J2K_Q2, context2, Lcup, >> > + Pcup)) >> > + < 0) >> > + goto free; >> > + >> > + for (int i = 0; i < 4; i++) >> > + sigma_n[4 * q2 + i] = (sig_pat[J2K_Q2] >> i) & 1; >> > + >> > + u[J2K_Q1] = 0; >> > + u[J2K_Q2] = 0; >> > + >> > + jpeg2000_bitbuf_refill_backwards(vlc_stream, vlc_buf); >> > + >> > + if (res_off[J2K_Q1] == 1 && res_off[J2K_Q2] == 1) { >> > + u_pfx[J2K_Q1] = vlc_decode_u_prefix(vlc_stream, >> > vlc_buf); >> > + u_pfx[J2K_Q2] = vlc_decode_u_prefix(vlc_stream, >> > vlc_buf); >> > + >> > + u_sfx[J2K_Q1] = vlc_decode_u_suffix(vlc_stream, >> > u_pfx[J2K_Q1], vlc_buf); >> > + u_sfx[J2K_Q2] = vlc_decode_u_suffix(vlc_stream, >> > u_pfx[J2K_Q2], vlc_buf); >> > + >> > + u_ext[J2K_Q1] = vlc_decode_u_extension(vlc_stream, >> > u_sfx[J2K_Q1], vlc_buf); >> > + u_ext[J2K_Q2] = vlc_decode_u_extension(vlc_stream, >> > u_sfx[J2K_Q2], vlc_buf); >> > + >> > + u[J2K_Q1] = u_pfx[J2K_Q1] + u_sfx[J2K_Q1] + >> > (u_ext[J2K_Q1] << 2); >> > + u[J2K_Q2] = u_pfx[J2K_Q2] + u_sfx[J2K_Q2] + >> > (u_ext[J2K_Q2] << 2); >> > + >> > + } else if (res_off[J2K_Q1] == 1 || res_off[J2K_Q2] == 1) >> > { >> > + uint8_t pos = res_off[J2K_Q1] == 1 ? 0 : 1; >> > + >> > + u_pfx[pos] = vlc_decode_u_prefix(vlc_stream, >> > vlc_buf); >> > + u_sfx[pos] = vlc_decode_u_suffix(vlc_stream, >> > u_pfx[pos], vlc_buf); >> > + u_ext[pos] = vlc_decode_u_extension(vlc_stream, >> > u_sfx[pos], vlc_buf); >> > + >> > + u[pos] = u_pfx[pos] + u_sfx[pos] + (u_ext[pos] << >> > 2); >> > + } >> > + sp = sig_pat[J2K_Q1]; >> > + >> > + gamma[J2K_Q1] = 1; >> > + >> > + if (sp == 0 || sp == 1 || sp == 2 || sp == 4 || sp == 8) >> > + gamma[J2K_Q1] = 0; >> > + >> > + sp = sig_pat[J2K_Q2]; >> > + >> > + gamma[J2K_Q2] = 1; >> > + >> > + if (sp == 0 || sp == 1 || sp == 2 || sp == 4 || sp == 8) >> > + gamma[J2K_Q2] = 0; >> > + >> > + E_n[J2K_Q1] = E[4 * (q1 - quad_width) + 1]; >> > + E_n[J2K_Q2] = E[4 * (q2 - quad_width) + 1]; >> > + >> > + E_ne[J2K_Q1] = E[4 * (q1 - quad_width) + 3]; >> > + E_ne[J2K_Q2] = E[4 * (q2 - quad_width) + 3]; >> > + >> > + E_nw[J2K_Q1] = (!is_divisible(q1, c)) * E[FFMAX((4 * (q1 >> > - quad_width) - 1), 0)]; >> > + E_nw[J2K_Q2] = (!is_divisible(q2, c)) * E[FFMAX((4 * (q2 >> > - quad_width) - 1), 0)]; >> > + >> > + E_nf[J2K_Q1] = (!is_divisible(q1 + 1, c)) * E[4 * (q1 - >> > quad_width) + 5]; >> > + E_nf[J2K_Q2] = (!is_divisible(q2 + 1, c)) * E[4 * (q2 - >> > quad_width) + 5]; >> > + >> > + max_e[J2K_Q1] = FFMAX(E_nw[J2K_Q1], FFMAX3(E_n[J2K_Q1], >> > E_ne[J2K_Q1], E_nf[J2K_Q1])); >> > + max_e[J2K_Q2] = FFMAX(E_nw[J2K_Q2], FFMAX3(E_n[J2K_Q2], >> > E_ne[J2K_Q2], E_nf[J2K_Q2])); >> > + >> > + kappa[J2K_Q1] = FFMAX(1, gamma[J2K_Q1] * (max_e[J2K_Q1] >> > - 1)); >> > + kappa[J2K_Q2] = FFMAX(1, gamma[J2K_Q2] * (max_e[J2K_Q2] >> > - 1)); >> > + >> > + U[J2K_Q1] = kappa[J2K_Q1] + u[J2K_Q1]; >> > + U[J2K_Q2] = kappa[J2K_Q2] + u[J2K_Q2]; >> > + >> > + for (int i = 0; i < 4; i++) { >> > + m[J2K_Q1][i] = sigma_n[4 * q1 + i] * U[J2K_Q1] - >> > ((emb_pat_k[J2K_Q1] >> i) & 1); >> > + m[J2K_Q2][i] = sigma_n[4 * q2 + i] * U[J2K_Q2] - >> > ((emb_pat_k[J2K_Q2] >> i) & 1); >> > + } >> > + recover_mag_sgn(mag_sgn_stream, J2K_Q1, q1, m_n, >> > known_1, emb_pat_1, v, m, >> > + E, mu_n, Dcup, Pcup, pLSB); >> > + >> > + recover_mag_sgn(mag_sgn_stream, J2K_Q2, q2, m_n, >> > known_1, emb_pat_1, v, m, >> > + E, mu_n, Dcup, Pcup, pLSB); >> > + >> > + /* move to the next quad pair */ >> > + q += 2; >> > + } >> > + if (quad_width % 2 == 1) { >> > + q1 = q; >> > + /* calculate context for current quad */ >> > + context1 = sigma_n[4 * (q1 - quad_width) + 1]; >> > + context1 += (sigma_n[4 * (q1 - quad_width) + 3] << 2); >> > + >> > + if (!is_divisible(q1, c)) { >> > + context1 |= sigma_n[4 * (q1 - quad_width) - 1]; >> > + context1 += (sigma_n[4 * q1 - 1] | sigma_n[4 * q1 - >> > 2]) << 1; >> > + } >> > + if (!is_divisible(q1 + 1, c)) >> > + context1 |= sigma_n[4 * (q1 - quad_width) + 5] << 2; >> > + >> > + if ((ret = jpeg2000_decode_sig_emb(s, mel_state, >> > mel_stream, vlc_stream, >> > + dec_CxtVLC_table1, >> > Dcup, sig_pat, res_off, >> > + emb_pat_k, emb_pat_1, >> > J2K_Q1, context1, Lcup, >> > + Pcup)) >> > + < 0) >> > + goto free; >> > + >> > + for (int i = 0; i < 4; i++) >> > + sigma_n[4 * q1 + i] = (sig_pat[J2K_Q1] >> i) & 1; >> > + >> > + u[J2K_Q1] = 0; >> > + >> > + /* Recover mag_sgn value */ >> > + if (res_off[J2K_Q1] == 1) { >> > + u_pfx[J2K_Q1] = vlc_decode_u_prefix(vlc_stream, >> > vlc_buf); >> > + u_sfx[J2K_Q1] = vlc_decode_u_suffix(vlc_stream, >> > u_pfx[J2K_Q1], vlc_buf); >> > + u_ext[J2K_Q1] = vlc_decode_u_extension(vlc_stream, >> > u_sfx[J2K_Q1], vlc_buf); >> > + >> > + u[J2K_Q1] = u_pfx[J2K_Q1] + u_sfx[J2K_Q1] + >> > (u_ext[J2K_Q1] << 2); >> > + } >> > + >> > + sp = sig_pat[J2K_Q1]; >> > + >> > + gamma[J2K_Q1] = 1; >> > + >> > + if (sp == 0 || sp == 1 || sp == 2 || sp == 4 || sp == 8) >> > + gamma[J2K_Q1] = 0; >> > + >> > + E_n[J2K_Q1] = E[4 * (q1 - quad_width) + 1]; >> > + >> > + E_ne[J2K_Q1] = E[4 * (q1 - quad_width) + 3]; >> > + >> > + E_nw[J2K_Q1] = (!is_divisible(q1, c)) * E[FFMAX((4 * (q1 >> > - quad_width) - 1), 0)]; >> > + >> > + E_nf[J2K_Q1] = (!is_divisible(q1 + 1, c)) * E[4 * (q1 - >> > quad_width) + 5]; >> > + >> > + max_e[J2K_Q1] = FFMAX(E_nw[J2K_Q1], FFMAX3(E_n[J2K_Q1], >> > E_ne[J2K_Q1], E_nf[J2K_Q1])); >> > + >> > + kappa[J2K_Q1] = FFMAX(1, gamma[J2K_Q1] * (max_e[J2K_Q1] >> > - 1)); >> > + >> > + U[J2K_Q1] = kappa[J2K_Q1] + u[J2K_Q1]; >> > + >> > + for (int i = 0; i < 4; i++) >> > + m[J2K_Q1][i] = sigma_n[4 * q1 + i] * U[J2K_Q1] - >> > ((emb_pat_k[J2K_Q1] >> i) & 1); >> > + >> > + recover_mag_sgn(mag_sgn_stream, J2K_Q1, q1, m_n, >> > known_1, emb_pat_1, v, m, >> > + E, mu_n, Dcup, Pcup, pLSB); >> > + q += 1; >> > + } >> > + } >> > + // convert to raster-scan >> >> Could this be done all in one go in the loop above? Writing to mu then >> reading from it later is not very cache friendly. >> >> No further comments for now. Might re-read it later. >> >> /Tomas >> >> _______________________________________________ >> ffmpeg-devel mailing list >> ffmpeg-devel@ffmpeg.org >> https://ffmpeg.org/mailman/listinfo/ffmpeg-devel >> >> To unsubscribe, visit link above, or email >> ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe". >> > _______________________________________________ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org https://ffmpeg.org/mailman/listinfo/ffmpeg-devel To unsubscribe, visit link above, or email ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".