📄 dsputil_iwmmxt_rnd.h
字号:
/* * iWMMXt optimized DSP utils * copyright (c) 2004 AGAWA Koji * * This file is part of FFmpeg. * * FFmpeg is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * FFmpeg is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with FFmpeg; if not, write to the Free Software * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA *//* This header intentionally has no multiple inclusion guards. It is meant to * be included multiple times and generates different code depending on the * value of certain #defines. */void DEF(put, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h){ int stride = line_size; asm volatile ( "and r12, %[pixels], #7 \n\t" "bic %[pixels], %[pixels], #7 \n\t" "tmcr wcgr1, r12 \n\t" "add r4, %[pixels], %[line_size] \n\t" "add r5, %[block], %[line_size] \n\t" "mov %[line_size], %[line_size], lsl #1 \n\t" "1: \n\t" "wldrd wr0, [%[pixels]] \n\t" "subs %[h], %[h], #2 \n\t" "wldrd wr1, [%[pixels], #8] \n\t" "add %[pixels], %[pixels], %[line_size] \n\t" "wldrd wr3, [r4] \n\t" "pld [%[pixels]] \n\t" "pld [%[pixels], #32] \n\t" "wldrd wr4, [r4, #8] \n\t" "add r4, r4, %[line_size] \n\t" "walignr1 wr8, wr0, wr1 \n\t" "pld [r4] \n\t" "pld [r4, #32] \n\t" "walignr1 wr10, wr3, wr4 \n\t" "wstrd wr8, [%[block]] \n\t" "add %[block], %[block], %[line_size] \n\t" "wstrd wr10, [r5] \n\t" "add r5, r5, %[line_size] \n\t" "bne 1b \n\t" : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) : : "memory", "r4", "r5", "r12");}void DEF(avg, pixels8)(uint8_t *block, const uint8_t *pixels, const int line_size, int h){ int stride = line_size; asm volatile ( "and r12, %[pixels], #7 \n\t" "bic %[pixels], %[pixels], #7 \n\t" "tmcr wcgr1, r12 \n\t" "add r4, %[pixels], %[line_size] \n\t" "add r5, %[block], %[line_size] \n\t" "mov %[line_size], %[line_size], lsl #1 \n\t" "1: \n\t" "wldrd wr0, [%[pixels]] \n\t" "subs %[h], %[h], #2 \n\t" "wldrd wr1, [%[pixels], #8] \n\t" "add %[pixels], %[pixels], %[line_size] \n\t" "wldrd wr3, [r4] \n\t" "pld [%[pixels]] \n\t" "pld [%[pixels], #32] \n\t" "wldrd wr4, [r4, #8] \n\t" "add r4, r4, %[line_size] \n\t" "walignr1 wr8, wr0, wr1 \n\t" "wldrd wr0, [%[block]] \n\t" "wldrd wr2, [r5] \n\t" "pld [r4] \n\t" "pld [r4, #32] \n\t" "walignr1 wr10, wr3, wr4 \n\t" WAVG2B" wr8, wr8, wr0 \n\t" WAVG2B" wr10, wr10, wr2 \n\t" "wstrd wr8, [%[block]] \n\t" "add %[block], %[block], %[line_size] \n\t" "wstrd wr10, [r5] \n\t" "pld [%[block]] \n\t" "pld [%[block], #32] \n\t" "add r5, r5, %[line_size] \n\t" "pld [r5] \n\t" "pld [r5, #32] \n\t" "bne 1b \n\t" : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) : : "memory", "r4", "r5", "r12");}void DEF(put, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h){ int stride = line_size; asm volatile ( "and r12, %[pixels], #7 \n\t" "bic %[pixels], %[pixels], #7 \n\t" "tmcr wcgr1, r12 \n\t" "add r4, %[pixels], %[line_size] \n\t" "add r5, %[block], %[line_size] \n\t" "mov %[line_size], %[line_size], lsl #1 \n\t" "1: \n\t" "wldrd wr0, [%[pixels]] \n\t" "wldrd wr1, [%[pixels], #8] \n\t" "subs %[h], %[h], #2 \n\t" "wldrd wr2, [%[pixels], #16] \n\t" "add %[pixels], %[pixels], %[line_size] \n\t" "wldrd wr3, [r4] \n\t" "pld [%[pixels]] \n\t" "pld [%[pixels], #32] \n\t" "walignr1 wr8, wr0, wr1 \n\t" "wldrd wr4, [r4, #8] \n\t" "walignr1 wr9, wr1, wr2 \n\t" "wldrd wr5, [r4, #16] \n\t" "add r4, r4, %[line_size] \n\t" "pld [r4] \n\t" "pld [r4, #32] \n\t" "walignr1 wr10, wr3, wr4 \n\t" "wstrd wr8, [%[block]] \n\t" "walignr1 wr11, wr4, wr5 \n\t" "wstrd wr9, [%[block], #8] \n\t" "add %[block], %[block], %[line_size] \n\t" "wstrd wr10, [r5] \n\t" "wstrd wr11, [r5, #8] \n\t" "add r5, r5, %[line_size] \n\t" "bne 1b \n\t" : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) : : "memory", "r4", "r5", "r12");}void DEF(avg, pixels16)(uint8_t *block, const uint8_t *pixels, const int line_size, int h){ int stride = line_size; asm volatile ( "pld [%[pixels]] \n\t" "pld [%[pixels], #32] \n\t" "pld [%[block]] \n\t" "pld [%[block], #32] \n\t" "and r12, %[pixels], #7 \n\t" "bic %[pixels], %[pixels], #7 \n\t" "tmcr wcgr1, r12 \n\t" "add r4, %[pixels], %[line_size]\n\t" "add r5, %[block], %[line_size] \n\t" "mov %[line_size], %[line_size], lsl #1 \n\t" "1: \n\t" "wldrd wr0, [%[pixels]] \n\t" "wldrd wr1, [%[pixels], #8] \n\t" "subs %[h], %[h], #2 \n\t" "wldrd wr2, [%[pixels], #16] \n\t" "add %[pixels], %[pixels], %[line_size] \n\t" "wldrd wr3, [r4] \n\t" "pld [%[pixels]] \n\t" "pld [%[pixels], #32] \n\t" "walignr1 wr8, wr0, wr1 \n\t" "wldrd wr4, [r4, #8] \n\t" "walignr1 wr9, wr1, wr2 \n\t" "wldrd wr5, [r4, #16] \n\t" "add r4, r4, %[line_size] \n\t" "wldrd wr0, [%[block]] \n\t" "pld [r4] \n\t" "wldrd wr1, [%[block], #8] \n\t" "pld [r4, #32] \n\t" "wldrd wr2, [r5] \n\t" "walignr1 wr10, wr3, wr4 \n\t" "wldrd wr3, [r5, #8] \n\t" WAVG2B" wr8, wr8, wr0 \n\t" WAVG2B" wr9, wr9, wr1 \n\t" WAVG2B" wr10, wr10, wr2 \n\t" "wstrd wr8, [%[block]] \n\t" "walignr1 wr11, wr4, wr5 \n\t" WAVG2B" wr11, wr11, wr3 \n\t" "wstrd wr9, [%[block], #8] \n\t" "add %[block], %[block], %[line_size] \n\t" "wstrd wr10, [r5] \n\t" "pld [%[block]] \n\t" "pld [%[block], #32] \n\t" "wstrd wr11, [r5, #8] \n\t" "add r5, r5, %[line_size] \n\t" "pld [r5] \n\t" "pld [r5, #32] \n\t" "bne 1b \n\t" : [block]"+r"(block), [pixels]"+r"(pixels), [line_size]"+r"(stride), [h]"+r"(h) : : "memory", "r4", "r5", "r12");}void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h){ int stride = line_size; // [wr0 wr1 wr2 wr3] for previous line // [wr4 wr5 wr6 wr7] for current line SET_RND(wr15); // =2 for rnd and =1 for no_rnd version asm volatile( "pld [%[pixels]] \n\t" "pld [%[pixels], #32] \n\t" "and r12, %[pixels], #7 \n\t" "bic %[pixels], %[pixels], #7 \n\t" "tmcr wcgr1, r12 \n\t" "add r12, r12, #1 \n\t" "add r4, %[pixels], %[line_size]\n\t" "tmcr wcgr2, r12 \n\t" "add r5, %[block], %[line_size] \n\t" "mov %[line_size], %[line_size], lsl #1 \n\t" "1: \n\t" "wldrd wr10, [%[pixels]] \n\t" "cmp r12, #8 \n\t" "wldrd wr11, [%[pixels], #8] \n\t" "add %[pixels], %[pixels], %[line_size] \n\t" "wldrd wr13, [r4] \n\t" "pld [%[pixels]] \n\t" "wldrd wr14, [r4, #8] \n\t" "pld [%[pixels], #32] \n\t" "add r4, r4, %[line_size] \n\t" "walignr1 wr0, wr10, wr11 \n\t" "pld [r4] \n\t" "pld [r4, #32] \n\t" "walignr1 wr2, wr13, wr14 \n\t" "wmoveq wr4, wr11 \n\t" "wmoveq wr6, wr14 \n\t" "walignr2ne wr4, wr10, wr11 \n\t" "walignr2ne wr6, wr13, wr14 \n\t" WAVG2B" wr0, wr0, wr4 \n\t" WAVG2B" wr2, wr2, wr6 \n\t" "wstrd wr0, [%[block]] \n\t" "subs %[h], %[h], #2 \n\t" "wstrd wr2, [r5] \n\t" "add %[block], %[block], %[line_size] \n\t" "add r5, r5, %[line_size] \n\t" "bne 1b \n\t" : [h]"+r"(h), [pixels]"+r"(pixels), [block]"+r"(block), [line_size]"+r"(stride) : : "r4", "r5", "r12", "memory");}void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, const int line_size, int h){ int stride = line_size; // [wr0 wr1 wr2 wr3] for previous line // [wr4 wr5 wr6 wr7] for current line SET_RND(wr15); // =2 for rnd and =1 for no_rnd version asm volatile( "pld [%[pixels]] \n\t" "pld [%[pixels], #32] \n\t" "and r12, %[pixels], #7 \n\t" "bic %[pixels], %[pixels], #7 \n\t" "tmcr wcgr1, r12 \n\t" "add r12, r12, #1 \n\t" "add r4, %[pixels], %[line_size]\n\t" "tmcr wcgr2, r12 \n\t" "add r5, %[block], %[line_size] \n\t" "mov %[line_size], %[line_size], lsl #1 \n\t" "1: \n\t" "wldrd wr10, [%[pixels]] \n\t" "cmp r12, #8 \n\t" "wldrd wr11, [%[pixels], #8] \n\t" "wldrd wr12, [%[pixels], #16] \n\t" "add %[pixels], %[pixels], %[line_size] \n\t" "wldrd wr13, [r4] \n\t" "pld [%[pixels]] \n\t" "wldrd wr14, [r4, #8] \n\t" "pld [%[pixels], #32] \n\t" "wldrd wr15, [r4, #16] \n\t" "add r4, r4, %[line_size] \n\t" "walignr1 wr0, wr10, wr11 \n\t" "pld [r4] \n\t" "pld [r4, #32] \n\t" "walignr1 wr1, wr11, wr12 \n\t"
⌨️ 快捷键说明
复制代码
Ctrl + C
搜索代码
Ctrl + F
全屏模式
F11
切换主题
Ctrl + Shift + D
显示快捷键
?
增大字号
Ctrl + =
减小字号
Ctrl + -