Spaces:
Runtime error
Runtime error
File size: 6,672 Bytes
b86f76f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 |
/*
* Copyright (c) 2017 Meng Wang <wangmeng.kids@bytedance.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavutil/arm/asm.S"
#include "neon.S"
/*
 * ff_hevc_sao_band_filter_neon_8
 *
 * SAO band filter for 8-bit pixels. For every pixel:
 *     dst[x] = clip_uint8(src[x] + offset_table[src[x] >> 3])
 *
 * Arguments as consumed by this code (ARM 32-bit, AAPCS):
 *     r0          dst
 *     r1          src
 *     r2          dst stride in bytes (post-increment of every row store)
 *     r3          src stride in bytes (post-increment of every row load)
 *     [sp]        width
 *     [sp, #4]    height
 *     [sp, #8]    offset_table, 32 entries of 16 bits (64 bytes are loaded)
 *
 * The picture is walked in vertical strips of 8 pixels, top to bottom.
 * When exactly 4 pixels of width remain, the strip is handled by a 4-pixel
 * variant. Supported widths are therefore 4, multiples of 8, and multiples
 * of 8 plus 4; any other positive width is not handled by the strip loop.
 * A width or height that is zero or negative returns without touching memory.
 *
 * Register roles:
 *     r4   rows left in the current strip      r12  height (reload value)
 *     r5   pixels of width still to process
 *     r6   dst of the current strip, row 0     r7   src of the current strip, row 0
 *     d0-d7 (q0-q3)  offset_table bytes        q14  32 in every byte
 *     q15  1 in every 16-bit lane
 *
 * Only caller-saved NEON registers (d0-d7, d16-d31) are used, so d8-d15 do
 * not have to be preserved.
 */

/*
 * Row kernel shared by the 8-pixel and the 4-pixel loop.
 * In:  d16 = source pixels. Out: d22 = filtered pixels.
 * Clobbers d17-d27. Does not touch the condition flags.
 *
 * The 16-bit table entry of pixel p lives in table bytes 2*i and 2*i + 1
 * with i = p >> 3. Both byte positions are packed into one 16-bit lane so
 * that a byte-wise vtbx fetches the complete entry. vtbx can address at
 * most 32 table bytes, hence the lookup is done in two rounds; a lane whose
 * position is out of range for a round is left unchanged by vtbx, and every
 * position (0..63) is in range for exactly one of the two rounds.
 */
.macro  hevc_sao_band_row
        vshr.u8         d17, d16, #3            // i = src >> 3, range 0..31
        vshll.u8        q9,  d17, #1            // 2*i, widened to 16 bit: position of the low byte
        vadd.u16        q11, q9,  q15           // 2*i + 1: position of the high byte
        vshl.u16        q10, q11, #8            // high-byte position moved into the upper byte of the lane
        vadd.u16        q10, q10, q9            // lane = (2*i + 1) << 8 | 2*i
        // Round 1: byte positions 0..31 (table entries 0..15)
        vtbx.8          d24, {d0-d3}, d20
        vtbx.8          d25, {d0-d3}, d21
        // Round 2: byte positions 32..63 (table entries 16..31), rebased to 0..31
        vsub.u8         q10, q10, q14
        vtbx.8          d24, {d4-d7}, d20
        vtbx.8          d25, {d4-d7}, d21
        vaddw.u8        q13, q12, d16           // offset + zero-extended src, 16 bit
        vqmovun.s16     d22, q13                // saturate to 0..255
.endm

function ff_hevc_sao_band_filter_neon_8, export=1
        push            {r4-r7}
        ldr             r5,  [sp, #16]          // width
        ldr             r4,  [sp, #20]          // height
        ldr             r12, [sp, #24]          // offset_table
        cmp             r4,  #1
        blt             99f                     // no rows: the row counter below would wrap around
        cmp             r5,  #1
        blt             99f                     // no columns
        vldm            r12, {d0-d7}            // whole offset_table; r12 is free afterwards
        mov             r12, r4                 // r12 = height
        mov             r6,  r0                 // r6 = dst of the first strip
        mov             r7,  r1                 // r7 = src of the first strip
        vmov.u16        q15, #1
        vmov.u8         q14, #32
0:      pld             [r1]
        cmp             r5,  #4
        beq             4f                      // exactly 4 pixels left: narrow strip
        // 8 pixel wide strip; the flags of subs survive the NEON code up to bne
8:      subs            r4,  #1
        vld1.8          {d16}, [r1], r3
        hevc_sao_band_row
        vst1.8          {d22}, [r0], r2
        bne             8b
        subs            r5,  #8
        beq             99f                     // all columns done
        mov             r4,  r12                // restart the row counter
        add             r6,  #8                 // next strip, back at row 0
        mov             r0,  r6
        add             r7,  #8
        mov             r1,  r7
        b               0b
        // 4 pixel wide strip; only lane 0 (4 bytes) of d16/d22 is meaningful
4:      subs            r4,  #1
        vld1.32         {d16[0]}, [r1], r3
        hevc_sao_band_row
        vst1.32         {d22[0]}, [r0], r2
        bne             4b
99:
        pop             {r4-r7}
        bx              lr
endfunc
/*
 * ff_hevc_sao_edge_filter_neon_8
 *
 * SAO edge filter for 8-bit pixels. For every pixel, with
 * a = src[x + a_stride] and b = src[x + b_stride]:
 *     k      = 2 + sign(src[x] - a) + sign(src[x] - b)        (0..4)
 *     dst[x] = clip_uint8(src[x] + sao_offset_val[edge_idx[k]])
 *
 * Arguments as consumed by this code (ARM 32-bit, AAPCS):
 *     r0          dst
 *     r1          src
 *     r2          dst stride in bytes (post-increment of every row store)
 *     r3          src stride in bytes (post-increment of every row load)
 *     [sp]        width
 *     [sp, #4]    height
 *     [sp, #8]    a_stride, byte offset from a pixel to its first neighbour
 *     [sp, #12]   b_stride, byte offset from a pixel to its second neighbour
 *     [sp, #16]   sao_offset_val, 5 entries of 16 bits
 *     [sp, #20]   edge_idx, 5 entries of 8 bits
 *
 * Exactly 5 entries of each table are read; the remaining table lanes are
 * zeroed instead of being loaded from beyond the end of the tables.
 *
 * The picture is walked in vertical strips of 8 pixels, top to bottom.
 * When exactly 4 pixels of width remain, the strip is handled by a 4-pixel
 * variant. Supported widths are therefore 4, multiples of 8, and multiples
 * of 8 plus 4; any other positive width is not handled by the strip loop.
 * A width or height that is zero or negative returns without touching memory.
 *
 * Register roles:
 *     r4   rows left in the current strip      r12  height (reload value)
 *     r5   pixels of width still to process
 *     r6   dst of the current strip, row 0     r7   src of the current strip, row 0
 *     r8   a_stride                            r9   b_stride
 *     r10  sao_offset_val, later the running pointer to neighbour a
 *     r11  edge_idx, later the running pointer to neighbour b
 *     d0   edge_idx table                      d1   2 in every byte
 *     d2-d3 (q1)  sao_offset_val table         q2   1 in every 16-bit lane
 *
 * Only caller-saved NEON registers (d0-d7, d16-d31) are used, so d8-d15 do
 * not have to be preserved.
 */

/*
 * Row kernel shared by the 8-pixel and the 4-pixel loop.
 * In:  d16 = src pixels, d17 = neighbour a, d18 = neighbour b.
 * Out: d26 = filtered pixels.
 * Clobbers d6, d7, d19-d25, d27-d31. Does not touch the condition flags.
 */
.macro  hevc_sao_edge_row
        vcgt.u8         d6,  d16, d17           // 0xff where src > a
        vshr.u8         d7,  d6,  #7            // +1 where src > a
        vclt.u8         d6,  d16, d17           // 0xff (-1) where src < a
        vadd.u8         d6,  d6,  d7            // sign(src - a)
        vcgt.u8         d19, d16, d18           // 0xff where src > b
        vshr.u8         d27, d19, #7            // +1 where src > b
        vclt.u8         d19, d16, d18           // 0xff (-1) where src < b
        vadd.u8         d19, d19, d27           // sign(src - b)
        vadd.s8         d6,  d6,  d19
        vadd.s8         d6,  d6,  d1            // k = 2 + sign(src - a) + sign(src - b)
        vtbx.8          d7,  {d0}, d6           // j = edge_idx[k]; k is always 0..4, so every lane is written
        // The 16-bit entry j lives in table bytes 2*j and 2*j + 1; pack both
        // byte positions into one 16-bit lane for a byte-wise lookup.
        vshll.u8        q14, d7,  #1            // 2*j, widened to 16 bit: position of the low byte
        vadd.u16        q15, q14, q2            // 2*j + 1: position of the high byte
        vshl.u16        q10, q15, #8            // high-byte position moved into the upper byte of the lane
        vadd.u16        q10, q10, q14           // lane = (2*j + 1) << 8 | 2*j
        vtbx.8          d22, {d2-d3}, d20       // sao_offset_val[j]
        vtbx.8          d23, {d2-d3}, d21
        vaddw.u8        q12, q11, d16           // offset + zero-extended src, 16 bit
        vqmovun.s16     d26, q12                // saturate to 0..255
.endm

function ff_hevc_sao_edge_filter_neon_8, export=1
        push            {r4-r11}
        ldr             r5,  [sp, #32]          // width
        ldr             r4,  [sp, #36]          // height
        ldr             r8,  [sp, #40]          // a_stride
        ldr             r9,  [sp, #44]          // b_stride
        ldr             r10, [sp, #48]          // sao_offset_val
        ldr             r11, [sp, #52]          // edge_idx
        cmp             r4,  #1
        blt             99f                     // no rows: the row counter below would wrap around
        cmp             r5,  #1
        blt             99f                     // no columns
        mov             r12, r4                 // r12 = height
        mov             r6,  r0                 // r6 = dst of the first strip
        mov             r7,  r1                 // r7 = src of the first strip
        // Load exactly the 5 entries of each table; unused lanes are zero.
        vmov.i8         d0,  #0
        vmov.i8         d3,  #0
        vld1.8          {d0[0]}, [r11]!         // edge_idx[0]
        vld1.8          {d0[1]}, [r11]!         // edge_idx[1]
        vld1.8          {d0[2]}, [r11]!         // edge_idx[2]
        vld1.8          {d0[3]}, [r11]!         // edge_idx[3]
        vld1.8          {d0[4]}, [r11]          // edge_idx[4]
        vld1.16         {d2}, [r10]!            // sao_offset_val[0..3]
        vld1.16         {d3[0]}, [r10]          // sao_offset_val[4]
        vmov.u8         d1,  #2
        vmov.u16        q2,  #1
0:      add             r10, r1,  r8            // r10 = &src[x + a_stride]
        add             r11, r1,  r9            // r11 = &src[x + b_stride]
        pld             [r1]
        cmp             r5,  #4
        beq             4f                      // exactly 4 pixels left: narrow strip
        // 8 pixel wide strip; the flags of subs survive the NEON code up to bne
8:      subs            r4,  #1
        vld1.8          {d16}, [r1],  r3        // src[x]
        vld1.8          {d17}, [r10], r3        // src[x + a_stride]
        vld1.8          {d18}, [r11], r3        // src[x + b_stride]
        hevc_sao_edge_row
        vst1.8          {d26}, [r0], r2
        bne             8b
        subs            r5,  #8
        beq             99f                     // all columns done
        mov             r4,  r12                // restart the row counter
        add             r6,  #8                 // next strip, back at row 0
        mov             r0,  r6
        add             r7,  #8
        mov             r1,  r7
        b               0b
        // 4 pixel wide strip; only lane 0 (4 bytes) of d16-d18/d26 is meaningful
4:      subs            r4,  #1
        vld1.32         {d16[0]}, [r1],  r3     // src[x]
        vld1.32         {d17[0]}, [r10], r3     // src[x + a_stride]
        vld1.32         {d18[0]}, [r11], r3     // src[x + b_stride]
        hevc_sao_edge_row
        vst1.32         {d26[0]}, [r0], r2
        bne             4b
99:
        pop             {r4-r11}
        bx              lr
endfunc
|