/*
* Copyright © 2013 Raspberry Pi Foundation
* Copyright © 2013 RISC OS Open Ltd
*
* Permission to use, copy, modify, distribute, and sell this software and its
* documentation for any purpose is hereby granted without fee, provided that
* the above copyright notice appear in all copies and that both that
* copyright notice and this permission notice appear in supporting
* documentation, and that the name of the copyright holders not be used in
* advertising or publicity pertaining to distribution of the software without
* specific, written prior permission. The copyright holders make no
* representations about the suitability of this software for any purpose. It
* is provided "as is" without express or implied warranty.
*
* THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
* SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
* SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
* OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
* SOFTWARE.
*
*/
#if ENABLE_FAST_BLT
#include <stddef.h>
#include <stdint.h>
#include "BitBltInternal.h"
/*
 * Halftone handling selected for a fast path (the halftone_type argument of
 * FAST_PATH):
 *   HALFTONE_NONE   - the wrapper passes 0 for both halftone arguments.
 *   HALFTONE_SCALAR - the wrapper passes the first halftone word.
 *   HALFTONE_VECTOR - the wrapper passes an address inside the halftone
 *                     table plus a packed row/height word.
 */
enum {
    HALFTONE_NONE   = 0,
    HALFTONE_SCALAR = 1,
    HALFTONE_VECTOR = 2
};
/* Reference only (not compiled): the prototype shared by every armSimd* routine that FAST_PATH declares. */
//typedef void (*armSimdAsmFn)(uint32_t width, uint32_t height, uint32_t *dst, uint32_t dstStride, uint32_t *src, uint32_t srcStride, uint32_t halftone, uint32_t halftoneInfo, uint32_t *colourMap, uint32_t bitPtrs, ...);
#define FAST_PATH(op, src_bpp, dst_bpp, qualifier, halftone_type) \
extern void armSimd##op##src_bpp##_##dst_bpp##qualifier##_wide (uint32_t width, uint32_t height, uint32_t *dst, uint32_t dstStride, uint32_t *src, uint32_t srcStride, uint32_t halftone, uint32_t halftoneInfo, uint32_t *colourMap, uint32_t bitPtrs, ...); \
extern void armSimd##op##src_bpp##_##dst_bpp##qualifier##_narrow(uint32_t width, uint32_t height, uint32_t *dst, uint32_t dstStride, uint32_t *src, uint32_t srcStride, uint32_t halftone, uint32_t halftoneInfo, uint32_t *colourMap, uint32_t bitPtrs, ...); \
extern void armSimd##op##src_bpp##_##dst_bpp##qualifier##_tiny (uint32_t width, uint32_t height, uint32_t *dst, uint32_t dstStride, uint32_t *src, uint32_t srcStride, uint32_t halftone, uint32_t halftoneInfo, uint32_t *colourMap, uint32_t bitPtrs, ...); \
static void fastPath##op##src_bpp##_##dst_bpp##qualifier(operation_t *op, uint32_t flags) \
{ \
IGNORE(flags); \
/* Copy certain parts of the operation structure to locals to help compiler */ \
uint32_t *srcBits = op->src.bits; \
uint32_t srcPitch = op->src.pitch / sizeof (uint32_t); \
uint32_t srcX = op->src.x; \
uint32_t srcY = op->src.y; \
uint32_t *dstBits = op->dest.bits; \
uint32_t dstPitch = op->dest.pitch / sizeof (uint32_t); \
uint32_t dstX = op->dest.x; \
uint32_t dstY = op->dest.y; \
uint32_t width = op->width; \
uint32_t height = op->height; \
uint32_t *cmLookupTable = *op->cmLookupTable; \
uint32_t halftoneHeight = op->halftoneHeight; \
uint32_t *halftoneBase = (uint32_t *) *op->halftoneBase; \
/* Get pointers to initial words */ \
uint32_t *src = 0; \
if (src_bpp > 0) \
src = srcBits + srcPitch * srcY + srcX * src_bpp / 32; \
uint32_t *dst = dstBits + dstPitch * dstY + dstX * dst_bpp / 32; \
/* Get initial pixel offset within words, mangle into pitch if possible */ \
uint32_t bitPtrs = 0; \
uint32_t srcXpix = 0; \
if (src_bpp > 0) { \
srcXpix = srcX & (31 / (src_bpp == 0 ? 1 : src_bpp)); /* ?: to avoid compiler warning on GCC! */ \
if (src_bpp < 8) \
bitPtrs = srcXpix << 27; \
else if (src_bpp == 8 || src_bpp == 16) \
srcPitch |= srcXpix << 30; \
} \
uint32_t dstXpix = dstX & (31/dst_bpp); \
if (dst_bpp < 8) \
bitPtrs |= dstXpix; \
else if (dst_bpp == 8 || dst_bpp == 16) \
dstPitch |= dstXpix << 30; \
/* Adjust strides to remove number of words partially or wholly read/written */ \
if (src_bpp > 0) \
srcPitch -= (src_bpp * (srcXpix + width) + 31) / 32; \
dstPitch -= (dst_bpp * (dstXpix + width) + 31) / 32; \
/* Deal with halftoning */ \
uint32_t halftone = 0; \
uint32_t halftoneInfo = 0; \
if (halftone_type == HALFTONE_SCALAR) \
halftone = halftoneBase[0]; \
else if (halftone_type == HALFTONE_VECTOR) { \
halftone = (uint32_t) (halftoneBase + halftoneHeight); \
halftoneInfo = (((dstY % halftoneHeight) - halftoneHeight) << 17) | (-halftoneHeight & 0x7FFF); \
} \
/* Work out which width class this operation is. \
* Rather than re-evaluate this for each line, we want one choice \
* for the whole operation; this means we can't assume anything about \
* alignment to sizes larger than 4 bytes, because that's the only \
* guarantee we have about line stride. */ \
if (width > (128-32)/dst_bpp && (((dstXpix-1) ^ (dstXpix+width-(128-32)/dst_bpp)) &~ (31/dst_bpp))) \
armSimd##op##src_bpp##_##dst_bpp##qualifier##_wide(width, height, dst, dstPitch, src, srcPitch, halftone, halftoneInfo, cmLookupTable, bitPtrs); \
else if (dst_bpp > 8 || (((dstXpix-1) ^ (dstXpix+width)) &~ (31/dst_bpp))) \
armSimd##op##src_bpp##_##dst_bpp##qualifier##_narrow(width, height, dst, dstPitch, src, srcPitch, halftone, halftoneInfo, cmLookupTable, bitPtrs); \
else \
armSimd##op##src_bpp##_##dst_bpp##qualifier##_tiny(width, height, dst, dstPitch, src, srcPitch, halftone, halftoneInfo, cmLookupTable, bitPtrs); \
}
/*
 * Instantiate one wrapper per supported (operation, source depth,
 * destination depth, halftone) combination. Each line defines a static
 * fastPath<op><src>_<dst><qualifier>() function and declares the three
 * armSimd... routines it dispatches to.
 */

/* SourceWord, source depth < destination depth; grouped by ratio 32x, 16x, 8x, 4x, 2x */
FAST_PATH(SourceWord,1,32,,HALFTONE_NONE)
FAST_PATH(SourceWord,1,16,,HALFTONE_NONE)
FAST_PATH(SourceWord,2,32,,HALFTONE_NONE)
FAST_PATH(SourceWord,1,8,,HALFTONE_NONE)
FAST_PATH(SourceWord,2,16,,HALFTONE_NONE)
FAST_PATH(SourceWord,4,32,,HALFTONE_NONE)
FAST_PATH(SourceWord,1,4,,HALFTONE_NONE)
FAST_PATH(SourceWord,2,8,,HALFTONE_NONE)
FAST_PATH(SourceWord,4,16,,HALFTONE_NONE)
FAST_PATH(SourceWord,8,32,,HALFTONE_NONE)
FAST_PATH(SourceWord,1,2,,HALFTONE_NONE)
FAST_PATH(SourceWord,2,4,,HALFTONE_NONE)
FAST_PATH(SourceWord,4,8,,HALFTONE_NONE)
FAST_PATH(SourceWord,8,16,,HALFTONE_NONE)
FAST_PATH(SourceWord,16,32,,HALFTONE_NONE)
/* SourceWord, source depth == destination depth */
FAST_PATH(SourceWord,1,1,,HALFTONE_NONE)
FAST_PATH(SourceWord,2,2,,HALFTONE_NONE)
FAST_PATH(SourceWord,4,4,,HALFTONE_NONE)
FAST_PATH(SourceWord,8,8,,HALFTONE_NONE)
FAST_PATH(SourceWord,16,16,,HALFTONE_NONE)
FAST_PATH(SourceWord,32,32,,HALFTONE_NONE)
/* SourceWord, source depth > destination depth; grouped by ratio 2x, 4x, 8x, 16x, 32x */
FAST_PATH(SourceWord,2,1,,HALFTONE_NONE)
FAST_PATH(SourceWord,4,2,,HALFTONE_NONE)
FAST_PATH(SourceWord,8,4,,HALFTONE_NONE)
FAST_PATH(SourceWord,16,8,,HALFTONE_NONE)
FAST_PATH(SourceWord,32,16,,HALFTONE_NONE)
FAST_PATH(SourceWord,4,1,,HALFTONE_NONE)
FAST_PATH(SourceWord,8,2,,HALFTONE_NONE)
FAST_PATH(SourceWord,16,4,,HALFTONE_NONE)
FAST_PATH(SourceWord,32,8,,HALFTONE_NONE)
FAST_PATH(SourceWord,8,1,,HALFTONE_NONE)
FAST_PATH(SourceWord,16,2,,HALFTONE_NONE)
FAST_PATH(SourceWord,32,4,,HALFTONE_NONE)
FAST_PATH(SourceWord,16,1,,HALFTONE_NONE)
FAST_PATH(SourceWord,32,2,,HALFTONE_NONE)
FAST_PATH(SourceWord,32,1,,HALFTONE_NONE)
/* SourceWord with no source (src_bpp == 0), each depth without and with a scalar halftone */
FAST_PATH(SourceWord,0,1,,HALFTONE_NONE)
FAST_PATH(SourceWord,0,1,_scalar,HALFTONE_SCALAR)
FAST_PATH(SourceWord,0,2,,HALFTONE_NONE)
FAST_PATH(SourceWord,0,2,_scalar,HALFTONE_SCALAR)
FAST_PATH(SourceWord,0,4,,HALFTONE_NONE)
FAST_PATH(SourceWord,0,4,_scalar,HALFTONE_SCALAR)
FAST_PATH(SourceWord,0,8,,HALFTONE_NONE)
FAST_PATH(SourceWord,0,8,_scalar,HALFTONE_SCALAR)
FAST_PATH(SourceWord,0,16,,HALFTONE_NONE)
FAST_PATH(SourceWord,0,16,_scalar,HALFTONE_SCALAR)
FAST_PATH(SourceWord,0,32,,HALFTONE_NONE)
FAST_PATH(SourceWord,0,32,_scalar,HALFTONE_SCALAR)
/* PixPaint, equal source and destination depth */
FAST_PATH(PixPaint,1,1,,HALFTONE_NONE)
FAST_PATH(PixPaint,2,2,,HALFTONE_NONE)
FAST_PATH(PixPaint,4,4,,HALFTONE_NONE)
FAST_PATH(PixPaint,8,8,,HALFTONE_NONE)
FAST_PATH(PixPaint,16,16,,HALFTONE_NONE)
FAST_PATH(PixPaint,32,32,,HALFTONE_NONE)
/* AlphaBlend, 32 bpp to 32 bpp only */
FAST_PATH(AlphaBlend,32,32,,HALFTONE_NONE)
/* BitAnd, equal source and destination depth */
FAST_PATH(BitAnd,1,1,,HALFTONE_NONE)
FAST_PATH(BitAnd,2,2,,HALFTONE_NONE)
FAST_PATH(BitAnd,4,4,,HALFTONE_NONE)
FAST_PATH(BitAnd,8,8,,HALFTONE_NONE)
FAST_PATH(BitAnd,16,16,,HALFTONE_NONE)
FAST_PATH(BitAnd,32,32,,HALFTONE_NONE)
/*
 * Registration table: one entry per wrapper generated by FAST_PATH, in the
 * same order as the instantiations. Each initializer lists, in order, the
 * wrapper function, a CR_* combination-rule constant and a flags word.
 *
 * The first two STD_FLAGS arguments (and the first STD_FLAGS_NO_SOURCE
 * argument) match the depths in the wrapper's name. The remaining arguments
 * (NO / DIRECT / 15BIT, and NO / SCALAR) appear to describe colour-map and
 * halftone handling - TODO confirm against the macro definitions, which are
 * not in this file.
 */
static fast_path_t fastPaths[] = {
/* SourceWord, source depth < destination depth */
{ fastPathSourceWord1_32, CR_sourceWord, STD_FLAGS(1,32,DIRECT,NO) },
{ fastPathSourceWord1_16, CR_sourceWord, STD_FLAGS(1,16,DIRECT,NO) },
{ fastPathSourceWord2_32, CR_sourceWord, STD_FLAGS(2,32,DIRECT,NO) },
{ fastPathSourceWord1_8, CR_sourceWord, STD_FLAGS(1,8,DIRECT,NO) },
{ fastPathSourceWord2_16, CR_sourceWord, STD_FLAGS(2,16,DIRECT,NO) },
{ fastPathSourceWord4_32, CR_sourceWord, STD_FLAGS(4,32,DIRECT,NO) },
{ fastPathSourceWord1_4, CR_sourceWord, STD_FLAGS(1,4,DIRECT,NO) },
{ fastPathSourceWord2_8, CR_sourceWord, STD_FLAGS(2,8,DIRECT,NO) },
{ fastPathSourceWord4_16, CR_sourceWord, STD_FLAGS(4,16,DIRECT,NO) },
{ fastPathSourceWord8_32, CR_sourceWord, STD_FLAGS(8,32,DIRECT,NO) },
{ fastPathSourceWord1_2, CR_sourceWord, STD_FLAGS(1,2,DIRECT,NO) },
{ fastPathSourceWord2_4, CR_sourceWord, STD_FLAGS(2,4,DIRECT,NO) },
{ fastPathSourceWord4_8, CR_sourceWord, STD_FLAGS(4,8,DIRECT,NO) },
{ fastPathSourceWord8_16, CR_sourceWord, STD_FLAGS(8,16,DIRECT,NO) },
{ fastPathSourceWord16_32, CR_sourceWord, STD_FLAGS(16,32,NO,NO) },
/* SourceWord, source depth == destination depth */
{ fastPathSourceWord1_1, CR_sourceWord, STD_FLAGS(1,1,NO,NO) },
{ fastPathSourceWord2_2, CR_sourceWord, STD_FLAGS(2,2,NO,NO) },
{ fastPathSourceWord4_4, CR_sourceWord, STD_FLAGS(4,4,NO,NO) },
{ fastPathSourceWord8_8, CR_sourceWord, STD_FLAGS(8,8,NO,NO) },
{ fastPathSourceWord16_16, CR_sourceWord, STD_FLAGS(16,16,NO,NO) },
{ fastPathSourceWord32_32, CR_sourceWord, STD_FLAGS(32,32,NO,NO) },
/* SourceWord, source depth > destination depth */
{ fastPathSourceWord2_1, CR_sourceWord, STD_FLAGS(2,1,DIRECT,NO) },
{ fastPathSourceWord4_2, CR_sourceWord, STD_FLAGS(4,2,DIRECT,NO) },
{ fastPathSourceWord8_4, CR_sourceWord, STD_FLAGS(8,4,DIRECT,NO) },
{ fastPathSourceWord16_8, CR_sourceWord, STD_FLAGS(16,8,DIRECT,NO) },
{ fastPathSourceWord32_16, CR_sourceWord, STD_FLAGS(32,16,NO,NO) },
{ fastPathSourceWord4_1, CR_sourceWord, STD_FLAGS(4,1,DIRECT,NO) },
{ fastPathSourceWord8_2, CR_sourceWord, STD_FLAGS(8,2,DIRECT,NO) },
{ fastPathSourceWord16_4, CR_sourceWord, STD_FLAGS(16,4,DIRECT,NO) },
{ fastPathSourceWord32_8, CR_sourceWord, STD_FLAGS(32,8,15BIT,NO) },
{ fastPathSourceWord8_1, CR_sourceWord, STD_FLAGS(8,1,DIRECT,NO) },
{ fastPathSourceWord16_2, CR_sourceWord, STD_FLAGS(16,2,DIRECT,NO) },
{ fastPathSourceWord32_4, CR_sourceWord, STD_FLAGS(32,4,15BIT,NO) },
{ fastPathSourceWord16_1, CR_sourceWord, STD_FLAGS(16,1,DIRECT,NO) },
{ fastPathSourceWord32_2, CR_sourceWord, STD_FLAGS(32,2,15BIT,NO) },
{ fastPathSourceWord32_1, CR_sourceWord, STD_FLAGS(32,1,15BIT,NO) },
/* SourceWord with no source, without and with scalar halftone */
{ fastPathSourceWord0_1, CR_sourceWord, STD_FLAGS_NO_SOURCE(1,NO) },
{ fastPathSourceWord0_1_scalar, CR_sourceWord, STD_FLAGS_NO_SOURCE(1,SCALAR) },
{ fastPathSourceWord0_2, CR_sourceWord, STD_FLAGS_NO_SOURCE(2,NO) },
{ fastPathSourceWord0_2_scalar, CR_sourceWord, STD_FLAGS_NO_SOURCE(2,SCALAR) },
{ fastPathSourceWord0_4, CR_sourceWord, STD_FLAGS_NO_SOURCE(4,NO) },
{ fastPathSourceWord0_4_scalar, CR_sourceWord, STD_FLAGS_NO_SOURCE(4,SCALAR) },
{ fastPathSourceWord0_8, CR_sourceWord, STD_FLAGS_NO_SOURCE(8,NO) },
{ fastPathSourceWord0_8_scalar, CR_sourceWord, STD_FLAGS_NO_SOURCE(8,SCALAR) },
{ fastPathSourceWord0_16, CR_sourceWord, STD_FLAGS_NO_SOURCE(16,NO) },
{ fastPathSourceWord0_16_scalar, CR_sourceWord, STD_FLAGS_NO_SOURCE(16,SCALAR) },
{ fastPathSourceWord0_32, CR_sourceWord, STD_FLAGS_NO_SOURCE(32,NO) },
{ fastPathSourceWord0_32_scalar, CR_sourceWord, STD_FLAGS_NO_SOURCE(32,SCALAR) },
/* PixPaint, equal depths */
{ fastPathPixPaint1_1, CR_pixPaint, STD_FLAGS(1,1,NO,NO) },
{ fastPathPixPaint2_2, CR_pixPaint, STD_FLAGS(2,2,NO,NO) },
{ fastPathPixPaint4_4, CR_pixPaint, STD_FLAGS(4,4,NO,NO) },
{ fastPathPixPaint8_8, CR_pixPaint, STD_FLAGS(8,8,NO,NO) },
{ fastPathPixPaint16_16, CR_pixPaint, STD_FLAGS(16,16,NO,NO) },
{ fastPathPixPaint32_32, CR_pixPaint, STD_FLAGS(32,32,NO,NO) },
/* AlphaBlend, 32 bpp only */
{ fastPathAlphaBlend32_32, CR_alphaBlend, STD_FLAGS(32,32,NO,NO) },
/* BitAnd, equal depths */
{ fastPathBitAnd1_1, CR_bitAnd, STD_FLAGS(1,1,NO,NO) },
{ fastPathBitAnd2_2, CR_bitAnd, STD_FLAGS(2,2,NO,NO) },
{ fastPathBitAnd4_4, CR_bitAnd, STD_FLAGS(4,4,NO,NO) },
{ fastPathBitAnd8_8, CR_bitAnd, STD_FLAGS(8,8,NO,NO) },
{ fastPathBitAnd16_16, CR_bitAnd, STD_FLAGS(16,16,NO,NO) },
{ fastPathBitAnd32_32, CR_bitAnd, STD_FLAGS(32,32,NO,NO) },
};
/*
 * Hand the whole fastPaths table to addFastPaths(), together with the number
 * of entries it contains.
 */
void addArmSimdFastPaths(void)
{
    /* Element count derived from the array itself so it tracks the table. */
    size_t entryCount = sizeof(fastPaths) / sizeof(fastPaths[0]);

    addFastPaths(fastPaths, entryCount);
}
#endif /* ENABLE_FAST_BLT */
|