/*
* Copyright © 2013 Raspberry Pi Foundation
* Copyright © 2013 RISC OS Open Ltd
*
* Permission to use, copy, modify, distribute, and sell this software and its
* documentation for any purpose is hereby granted without fee, provided that
* the above copyright notice appear in all copies and that both that
* copyright notice and this permission notice appear in supporting
* documentation, and that the name of the copyright holders not be used in
* advertising or publicity pertaining to distribution of the software without
* specific, written prior permission. The copyright holders make no
* representations about the suitability of this software for any purpose. It
* is provided "as is" without express or implied warranty.
*
* THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
* SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
* SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
* AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
* OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
* SOFTWARE.
*
*/
#if ENABLE_FAST_BLT
#include <stddef.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include "BitBltInternal.h"
/* Rotate the 32-bit value x right by s bits. s must be in the range 1..31:
 * with s == 0 the left shift would be by 32 bits, which is undefined. */
#define ROR(x,s) (((uint32_t)(x))>>(s)|((uint32_t)(x))<<((32-(s))))
/* Larger / smaller of two values. Both are plain macros, so each argument
 * may be evaluated twice - do not pass expressions with side effects. */
#define MAX(a,b) ((a)>(b)?(a):(b))
#define MIN(a,b) ((a)<(b)?(a):(b))
/* log2table[n] holds log2(n) for n = 1, 2, 4, 8, 16 and 32; every other
 * index holds 0. This file indexes it with a bitmap depth to obtain the
 * shift count used to convert between bytes and pixels. */
static const uint8_t log2table[33] = { 0, 0, 1, 0, 2, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0,
4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5 };
#ifdef DEBUG
#define dprintf(args) do { check_printf args; } while (0)
#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
/*
 * printf-style debug output that is only emitted when the DEBUG environment
 * variable is present. The environment is consulted once, on the first
 * call, and the answer is cached in function-local statics.
 *
 * format  printf format string; it is only read, hence const
 * ...     arguments consumed by the format string
 *
 * Returns whatever vprintf() returned, or 0 if debug output is disabled.
 */
static int check_printf(const char *format, ...)
{
    static bool envChecked;
    static bool debugEnabled;

    if (!envChecked) {
        /* Only the presence of the variable matters, not its value */
        debugEnabled = getenv("DEBUG") != NULL;
        envChecked = true;
    }
    if (!debugEnabled)
        return 0;

    va_list ap;
    va_start(ap, format);
    int result = vprintf(format, ap);
    va_end(ap);
    return result;
}
#else
#define dprintf(args)
#endif
/*
 * Fast path for clearing a rectangle of a 4 bits-per-pixel destination.
 * Pixels are addressed within 32-bit words, with the first pixel of a word
 * occupying its most significant bits. Every pixel in the width x height
 * rectangle at (destX, destY) is set to zero; neighbouring pixels that
 * share a word with the rectangle are preserved.
 */
static void fastPathClearWord4(operation_t *op, uint32_t flags)
{
    IGNORE(flags);
    COPY_OP_TO_LOCALS(op, uint32_t, uint32_t);

    uint32_t *wordPtr = destBits + destPitch * destY + destX * 4 / 32;
    /* Bit position (counted from the MS end) of the first pixel, and the
     * bit position one past the last pixel, relative to the first word */
    const uint32_t leadBit = (destX * 4) & 31;
    const uint32_t limitBit = leadBit + width * 4;

    if (32 - (signed) limitBit >= 0) {
        /* The whole row lives inside one word: build a mask covering
         * bits [leadBit, limitBit) and clear just those on every row */
        const uint32_t upperBits = -1u << (32 - limitBit);
        const uint32_t clearBits = upperBits & (upperBits >> leadBit);
        do {
            *wordPtr &= ~clearBits;
            wordPtr += destPitch;
        } while (--height > 0);
        return;
    }

    /* The row spans several words. Pre-subtract the number of words the
     * inner code steps over, so adding destPitch at the end of a row lands
     * on the first word of the next one. No rounding up: the pointer is
     * not advanced past a partial trailing word. */
    destPitch -= limitBit / 32;
    do {
        uint32_t pixelsLeft = width;

        if (leadBit > 0) {
            /* Partial leading word: clear from leadBit to the LS end */
            *wordPtr &= ~(-1u >> leadBit);
            wordPtr++;
            pixelsLeft -= (32 - leadBit) / 4;
        }

        /* Whole words */
        for (; pixelsLeft >= 32/4; pixelsLeft -= 32/4)
            *wordPtr++ = 0;

        /* Partial trailing word: fewer than 8 pixels remain here, so
         * clear that many pixels starting at the MS end */
        if (pixelsLeft > 0)
            *wordPtr &= ~(-1u << (32 - pixelsLeft * 4));

        wordPtr += destPitch;
    } while (--height > 0);
}
/*
 * Fast path for clearing a rectangle of an 8 bits-per-pixel destination:
 * writes zero bytes over the width x height rectangle at (destX, destY).
 * Work is organised around 32-bit words; rows that start or end part-way
 * through a word get byte/halfword stores for the partial words.
 *
 * Byte addressing used throughout: the pixel at position k (0..3) within a
 * word is written at byte address 3 - k of that word.
 * NOTE(review): this looks like big-endian pixel order inside little-endian
 * words - confirm against the platform assumptions of the caller.
 */
static void fastPathClearWord8(operation_t *op, uint32_t flags)
{
IGNORE(flags);
COPY_OP_TO_LOCALS(op, uint32_t, uint8_t);
/* Point at the start of the word that contains the first pixel */
uint8_t *dest = destBits + destPitch * destY + (destX &~ 3);
/* Stride is defined to be an integer number of words, so there's actually
* 2 bits spare there - use them to hold the byte offset into first word.
* From here on: (destPitch >> 30) is destX & 3, and (destPitch << 2) is the
* stride in bytes with the packed offset shifted out. */
destPitch = (destPitch >> 2) | (destX << 30);
/* Taken when the row ends before the last byte of its first word */
if (4 - (signed)((destPitch >> 30) + width) > 0) {
do {
/* Lowest address offset at which to write */
uint32_t offset = 4 - (destPitch >> 30);
/* data is always zero here, so the shifts applied to it are no-ops */
uint32_t data = 0;
data >>= (destPitch >> 30) * 8;
uint32_t old_x;
uint32_t x = width;
/* One byte per pixel, at descending byte addresses */
while (old_x = x, x--, old_x >= 1) {
dest[--offset] = data;
data >>= 8;
}
dest += destPitch << 2;
} while (--height > 0);
} else {
/* Don't bother rounding up, we won't increment dest for trailing word if any.
* Subtracting the words stepped over per row means adding the stride at
* the end of a row lands on the first word of the next row. */
destPitch -= ((destPitch >> 30) + width) >> 2;
do {
uint32_t x = width;
uint32_t data = 0;
/* Partial leading word: 1..3 pixels occupying bytes 0..leading_pixels-1 */
if (destPitch >> 30) {
uint32_t leading_pixels = 4 - (destPitch >> 30);
if (leading_pixels >= 2) {
/* bytes 0 and 1 in one halfword store */
((uint16_t *)dest)[0] = data;
data >>= 16;
}
if (leading_pixels > 2)
((uint8_t *)dest)[2] = data;
if (leading_pixels < 2)
((uint8_t *)dest)[0] = data;
dest += 4;
x -= leading_pixels;
}
uint32_t old_x;
/* Whole words. x is decremented once more than the number of words
* written, but only its low two bits are used afterwards. */
while (old_x = x, x -= 32/8, old_x >= 32/8) {
*(uint32_t *)dest = 0;
dest += 4;
}
/* Partial trailing word: 1..3 pixels occupying bytes 3 down to
* 4 - trailing_pixels. dest is not advanced past it. */
uint32_t trailing_pixels = x & 3;
if (trailing_pixels) {
uint32_t data = 0;
data >>= trailing_pixels * 8;
if (trailing_pixels > 2u) {
((uint8_t *)dest)[1] = data;
data >>= 8;
}
if (trailing_pixels >= 2u)
/* bytes 2 and 3 in one halfword store */
((uint16_t *)dest)[1] = data;
if (trailing_pixels < 2u)
((uint8_t *)dest)[3] = data;
}
dest += destPitch << 2;
} while (--height > 0);
}
}
/*
 * Fast path for clearing a rectangle of a 32 bits-per-pixel destination.
 * Each row of the width x height rectangle at (destX, destY) is zeroed
 * with a single memset().
 */
static void fastPathClearWord32(operation_t *op, uint32_t flags)
{
    IGNORE(flags);
    COPY_OP_TO_LOCALS(op, uint32_t, uint32_t);

    uint32_t *rowStart = destBits + destPitch * destY + destX;
    /* One pixel is one 32-bit word, so the byte count is the same for
     * every row and can be worked out once */
    const size_t rowBytes = width * sizeof (uint32_t);

    do {
        memset(rowStart, 0, rowBytes);
        rowStart += destPitch;
    } while (--height > 0);
}
/*
 * Fast path for filling a rectangle of a 32 bits-per-pixel destination
 * with a single value: the first word of the operation's halftone. There
 * is no source bitmap involved. Every pixel of the width x height
 * rectangle at (destX, destY) receives that word.
 */
static void fastPathSourceWord0_32_scalar(operation_t *op, uint32_t flags)
{
    IGNORE(flags);
    COPY_OP_TO_LOCALS(op, uint32_t, uint32_t);

    const uint32_t fillWord = (*op->halftoneBase)[0];
    uint32_t *dest = destBits + destPitch * destY + destX;

    do {
        uint32_t pixelsLeft = width;
        do {
            *dest = fillWord;
            ++dest;
            --pixelsLeft;
        } while (pixelsLeft > 0);
        /* dest has advanced by one row's worth of pixels; step over the
         * remainder of the stride to reach the next row */
        dest += destPitch - width;
    } while (--height > 0);
}
/*
 * Fast path for a straight copy between 32 bits-per-pixel bitmaps. Rows
 * are copied one at a time, top to bottom, using memmove() so that a row
 * whose source and destination ranges overlap is still copied correctly.
 */
static void fastPathSourceWord32_32(operation_t *op, uint32_t flags)
{
    IGNORE(flags);
    COPY_OP_TO_LOCALS(op, uint32_t, uint32_t);

    uint32_t *fromRow = srcBits + srcPitch * srcY + srcX;
    uint32_t *toRow = destBits + destPitch * destY + destX;
    /* One pixel is one 32-bit word; the row length never changes */
    const size_t rowBytes = width * sizeof (uint32_t);

    do {
        memmove(toRow, fromRow, rowBytes);
        fromRow += srcPitch;
        toRow += destPitch;
    } while (--height > 0);
}
/*
 * Adapter that lets the other fast paths ignore horizontally overlapping
 * source and destination. Each scanline is processed from right to left in
 * chunks; every chunk is first copied from the source into a temporary
 * buffer on the stack (using a CR_sourceWord routine with no colour map and
 * no halftone), then combined from that buffer into the destination using
 * the routine for the caller's real combination rule. Because each step
 * reads and writes distinct memory, both can be ordinary no-overlap
 * routines.
 *
 * If either routine cannot be found, the whole operation is handed to
 * copyBitsFallback() with the original flags.
 */
static void fastPathRightToLeft(operation_t *op, uint32_t flags)
{
    /* To enable the majority of fast path implementations to forget about
     * having to handle this case, we handle it by the use of a temporary
     * buffer on the stack. This will live in the L1 cache most of the
     * time and so not be as bad as it sounds. To further mitigate this
     * overhead, we try to match the word-alignment of the source data to
     * that of the destination data during the copy to the temporary buffer,
     * and split the data across buffers at destination cacheline boundaries.
     * We can make certain assumptions: the stride, colour depth and
     * endianness of source and destination should be the same.
     */
    uint32_t flagsToDest = (flags &~ ONLY_NO_OVERLAP) | FAST_PATH_NO_OVERLAP;
    uint32_t flagsFromSrc = (flagsToDest &~ (ONLY_NO_COLOR_MAP | ONLY_NO_HALFTONE | FAST_PATH_CA_NO_GAMMA | FAST_PATH_CA_HAS_GAMMA))
            | FAST_PATH_NO_COLOR_MAP | FAST_PATH_NO_HALFTONE;
    void (*funcToDest)(operation_t *, uint32_t), (*funcFromSrc)(operation_t *, uint32_t);

    funcToDest = lookupFastPath(op->combinationRule, flagsToDest);
    if (funcToDest == NULL) {
        copyBitsFallback(op, flags);
        return;
    }
    if (op->combinationRule == CR_sourceWord && flagsToDest == flagsFromSrc)
        /* The caller's operation is already a plain copy: reuse the routine */
        funcFromSrc = funcToDest;
    else {
        funcFromSrc = lookupFastPath(CR_sourceWord, flagsFromSrc);
        if (funcFromSrc == NULL) {
            copyBitsFallback(op, flags);
            return;
        }
    }

    operation_t opFromSrc = *op;
    operation_t opToDest = *op;
    uint32_t shift = log2table[op->src.depth];
    uint32_t stride = op->src.pitch;
    /* Only the low 32 bits of the address are wanted (see below); going via
     * uintptr_t makes that truncation explicit and well-formed on hosts
     * where pointers are wider than 32 bits. */
    uint32_t line = (uint32_t) (uintptr_t) op->src.bits;
    /* Convert to pixels. It doesn't matter if we lose the MS bits of
     * addresses, since they're passed down as pixel offsets anyway */
    if (shift > 3) {
        stride >>= shift - 3;
        line >>= shift - 3;
    } else if (shift < 3) {
        stride <<= 3 - shift;
        line <<= 3 - shift;
    }
    line += stride * op->src.y;
    uint32_t cacheline_len = (CACHELINE_LEN*8) >> shift;
    uint32_t src_x = op->src.x;
    uint32_t dest_x = op->dest.x;
    uint32_t width = op->width;
    uint32_t height = op->height;
    uint8_t tempBuffer[CACHELINE_LEN * 64];
#define BUFFER_LEN_PIXELS (cacheline_len * (sizeof tempBuffer / CACHELINE_LEN))

    /* First step: plain single-row copy from the source into tempBuffer.
     * funcFromSrc was looked up for CR_sourceWord with no colour map and no
     * halftone, so make the operation it receives say the same thing. */
    opFromSrc.combinationRule = CR_sourceWord;
    opFromSrc.dest.bits = tempBuffer;
    opFromSrc.dest.y = 0;
    opFromSrc.height = 1;
    opFromSrc.cmFlags = 0;
    opFromSrc.cmMask = 0;
    opFromSrc.cmLookupTable = NULL;
    opFromSrc.noHalftone = true;
    opFromSrc.halftoneBase = NULL;
    /* Second step: the caller's operation, single row, reading tempBuffer */
    opToDest.src.bits = tempBuffer;
    opToDest.src.y = 0;
    opToDest.height = 1;

    do {
        uint32_t firstCacheline = (line + dest_x) & -cacheline_len;
        uint32_t lastPixelRemaining = line + dest_x + width;
        uint32_t chunkBase = ((lastPixelRemaining + cacheline_len - 1) & -cacheline_len) - BUFFER_LEN_PIXELS;
        /* Working from the right, process buffer-size chunks, breaking
         * at cacheline boundaries. The slightly unusual comparison is
         * to handle address wrapping since we may have shifted some
         * address bits off the top of the word (having more than 2
         * million pixels on one line is rather less likely). */
        opFromSrc.dest.x = opToDest.src.x = 0;
        while ((int32_t)(chunkBase - firstCacheline) > 0) {
            opToDest.dest.x = chunkBase - line;
            opFromSrc.src.x = src_x - dest_x + opToDest.dest.x;
            opFromSrc.width = opToDest.width = lastPixelRemaining - chunkBase;
            funcFromSrc(&opFromSrc, flagsFromSrc);
            funcToDest(&opToDest, flagsToDest);
            lastPixelRemaining = chunkBase;
            chunkBase -= BUFFER_LEN_PIXELS;
        }
        /* In general, the dest below won't start cacheline-aligned,
         * but if we maintain the offset from its cacheline then we at
         * least ensure no word skew in the second operation. */
        opFromSrc.dest.x = opToDest.src.x = line + dest_x - firstCacheline;
        opToDest.dest.x = dest_x;
        opFromSrc.src.x = src_x;
        opFromSrc.width = opToDest.width = lastPixelRemaining - (line + dest_x);
        funcFromSrc(&opFromSrc, flagsFromSrc);
        funcToDest(&opToDest, flagsToDest);

        /* Next scanline; source and destination rows advance together */
        line += stride;
        opFromSrc.src.y = ++opToDest.dest.y;
    } while (--height > 0);
}
/*
 * Adapter for operations whose source and destination overlap vertically.
 * It looks up the routine that would handle the same operation without
 * overlap and runs it on a copy of the operation that has been flipped
 * upside down: both bitmaps are re-based on their last affected scanline
 * and given negated pitches, so the routine walks the rows bottom to top.
 * Falls back to copyBitsFallback() when no such routine exists.
 */
static void fastPathBottomToTop(operation_t *op, uint32_t flags)
{
    uint32_t noOverlapFlags = (flags &~ FAST_PATH_V_OVERLAP) | FAST_PATH_NO_OVERLAP;
    void (*topDownFunc)(operation_t *, uint32_t) = lookupFastPath(op->combinationRule, noOverlapFlags);

    if (topDownFunc == NULL) {
        copyBitsFallback(op, flags);
        return;
    }

    /* As long as vector halftone isn't in use, this is just a matter of
     * processing the scanlines in the opposite order */
    operation_t flipped = *op;

    /* Row 0 of the flipped source is the last source row of the original */
    flipped.src.pitch = -op->src.pitch;
    flipped.src.y = 0;
    flipped.src.bits = (uint8_t *) op->src.bits + (op->src.y + op->height - 1) * op->src.pitch;

    /* ...and likewise for the destination */
    flipped.dest.pitch = -op->dest.pitch;
    flipped.dest.y = 0;
    flipped.dest.bits = (uint8_t *) op->dest.bits + (op->dest.y + op->height - 1) * op->dest.pitch;

    topDownFunc(&flipped, noOverlapFlags);
}
/*
 * Adapter that implements a combination rule between bitmaps of different
 * depths in terms of two simpler routines and a temporary buffer on the
 * stack:
 *
 *   1. funcFromSrc - a CR_sourceWord routine, looked up with the caller's
 *      flags except that halftone is forced off - writes source pixels
 *      into the buffer at the destination's depth;
 *   2. funcToDest - the routine for the caller's combination rule, looked
 *      up as a same-depth operation with no colour map - combines the
 *      buffer into the destination.
 *
 * Each scanline is processed left to right in chunks no larger than the
 * buffer. If either routine is missing, or the caller's rule is itself
 * CR_sourceWord (which would recurse), the operation is handed to
 * copyBitsFallback() with the original flags.
 */
static void fastPathDepthConv(operation_t *op, uint32_t flags)
{
    uint32_t flagsToDest = (flags &~ (ONLY_SRC_0BPP | ONLY_NO_COLOR_MAP)) |
            ((flags & (FAST_PATH_DEST_1BPP | ONLY_DEST_1BPP)) / (FAST_PATH_DEST_1BPP / FAST_PATH_SRC_1BPP)) |
            FAST_PATH_NO_COLOR_MAP;
    uint32_t flagsFromSrc = (flags &~ ONLY_NO_HALFTONE) | FAST_PATH_NO_HALFTONE;
    void (*funcToDest)(operation_t *, uint32_t), (*funcFromSrc)(operation_t *, uint32_t);

    funcToDest = lookupFastPath(op->combinationRule, flagsToDest);
    if (funcToDest == NULL) {
        copyBitsFallback(op, flags);
        return;
    }
    if (op->combinationRule == CR_sourceWord) {
        /* This trick requires independent implementations of each
         * colour depth conversion using the sourceWord combinationRule.
         * On platforms where these are not available, we end up here,
         * but the lookup below would cause infinite recursion, so bail
         * out beforehand. */
        copyBitsFallback(op, flags);
        return;
    }
    funcFromSrc = lookupFastPath(CR_sourceWord, flagsFromSrc);
    if (funcFromSrc == NULL) {
        copyBitsFallback(op, flags);
        return;
    }

    operation_t opFromSrc = *op;
    operation_t opToDest = *op;
    uint32_t shift = log2table[op->dest.depth];
    /* 'line' models the address of the current destination scanline in
     * destination pixels, so the per-row step must be the destination
     * pitch; the source pitch belongs to a bitmap of a different depth. */
    uint32_t stride = op->dest.pitch;
    /* Only the low 32 bits of the address are wanted (see below); going via
     * uintptr_t makes that truncation explicit and well-formed on hosts
     * where pointers are wider than 32 bits. */
    uint32_t line = (uint32_t) (uintptr_t) op->dest.bits;
    /* Convert to pixels. It doesn't matter if we lose the MS bits of
     * addresses, since they're passed down as pixel offsets anyway */
    if (shift > 3) {
        stride >>= shift - 3;
        line >>= shift - 3;
    } else if (shift < 3) {
        stride <<= 3 - shift;
        line <<= 3 - shift;
    }
    line += stride * op->dest.y;
    uint32_t cacheline_len = (CACHELINE_LEN*8) >> shift;
    uint32_t src_x = op->src.x;
    uint32_t dest_x = op->dest.x;
    uint32_t width = op->width;
    uint32_t height = op->height;
    uint8_t tempBuffer[CACHELINE_LEN * 64];
#define BUFFER_LEN_PIXELS (cacheline_len * (sizeof tempBuffer / CACHELINE_LEN))

    /* First step: depth conversion into tempBuffer. flagsFromSrc selects a
     * routine that applies no halftone, so the operation it is given must
     * not carry one; the colour map is left in place for this step. */
    opFromSrc.combinationRule = CR_sourceWord;
    opFromSrc.dest.bits = tempBuffer;
    opFromSrc.dest.y = 0;
    opFromSrc.height = 1;
    opFromSrc.noHalftone = true;
    opFromSrc.halftoneBase = NULL;
    /* Second step: the caller's rule, reading tempBuffer as a source of
     * the destination's depth. flagsToDest selects a routine that applies
     * no colour map but still honours the caller's halftone, so only the
     * colour map is removed here - the halftone must stay available. */
    opToDest.src.bits = tempBuffer;
    opToDest.src.depth = op->dest.depth;
    opToDest.src.pitch = op->dest.pitch;
    opToDest.src.y = 0;
    opToDest.height = 1;
    opToDest.cmFlags = 0;
    opToDest.cmMask = 0;
    opToDest.cmLookupTable = NULL;

    do {
        /* Working from left to right, process chunks of the size of
         * the temporary buffer (measured in pixels at a depth that
         * matches the depth of the destination), breaking at pixels
         * that correspond to cacheline boundaries at the destination. */
        uint32_t lastPixel = (line + dest_x + width);
        uint32_t chunkBase = (line + dest_x) & -cacheline_len;
        uint32_t chunkLimit = chunkBase + BUFFER_LEN_PIXELS;
        opFromSrc.src.x = src_x;
        opToDest.dest.x = dest_x;
        opFromSrc.width = opToDest.width = chunkLimit - (line + dest_x);
        opFromSrc.dest.x = opToDest.src.x = BUFFER_LEN_PIXELS - opFromSrc.width;
        while ((int32_t)(chunkLimit - lastPixel) < 0) {
            funcFromSrc(&opFromSrc, flagsFromSrc);
            funcToDest(&opToDest, flagsToDest);
            chunkBase = chunkLimit;
            chunkLimit = chunkBase + BUFFER_LEN_PIXELS;
            opFromSrc.src.x += opFromSrc.width;
            opToDest.dest.x += opFromSrc.width;
            opFromSrc.width = opToDest.width = BUFFER_LEN_PIXELS;
            opFromSrc.dest.x = opToDest.src.x = 0;
        }
        /* Final (possibly only) chunk of the row. Keep the buffer offset
         * equal to the destination's offset from its cacheline, which
         * avoids word skew in the second operation. The offset has to be
         * taken from the pixel address (line + x), not from x alone:
         * that is the quantity the chunk widths were derived from, so it
         * is what guarantees offset + width fits inside tempBuffer. */
        opFromSrc.dest.x = opToDest.src.x = (line + opToDest.dest.x) & (cacheline_len - 1);
        opFromSrc.width = opToDest.width = lastPixel - (line + opToDest.dest.x);
        funcFromSrc(&opFromSrc, flagsFromSrc);
        funcToDest(&opToDest, flagsToDest);

        /* Next scanline */
        line += stride;
        ++opFromSrc.src.y;
        ++opToDest.dest.y;
    } while (--height > 0);
}
/*
 * Fast path that deliberately does nothing: neither argument is used and
 * no memory is touched.
 */
static void fastPathNoOp(operation_t *op, uint32_t flags)
{
    /* Both parameters exist only to match the fast path signature */
    IGNORE(flags);
    IGNORE(op);
}
/*
 * The fast paths implemented in this file. Each entry gives, in order: the
 * function, the combination rule it serves (CR_any for the adapters that
 * work with every rule), and a flag word describing the cases it applies
 * to. The table is registered as a whole by addGenericFastPaths().
 * NOTE(review): the flag words appear to name the cases an entry does NOT
 * accept (the ONLY_* names, and SourceWord32_32 removing
 * FAST_PATH_H_OVERLAP from its word so that horizontal overlap reaches its
 * memmove-based copy) - confirm against lookupFastPath().
 * NOTE(review): entry order may matter to the lookup - confirm before
 * reordering.
 */
static fast_path_t fastPaths[] = {
{ fastPathClearWord4, CR_clearWord, STD_FLAGS_NO_SOURCE(4,NO) },
{ fastPathClearWord8, CR_clearWord, STD_FLAGS_NO_SOURCE(8,NO) },
{ fastPathClearWord32, CR_clearWord, STD_FLAGS_NO_SOURCE(32,NO) },
{ fastPathSourceWord0_32_scalar, CR_sourceWord, STD_FLAGS_NO_SOURCE(32,SCALAR) },
{ fastPathSourceWord32_32, CR_sourceWord, STD_FLAGS(32,32,NO,NO) &~ FAST_PATH_H_OVERLAP },
{ fastPathNoOp, CR_destinationWord, 0 },
/* Some special fast paths to extend the abilities of the others in corner cases */
{ fastPathRightToLeft, CR_any, FAST_PATH_VECTOR_HALFTONE | ONLY_H_OVERLAP },
{ fastPathBottomToTop, CR_any, FAST_PATH_VECTOR_HALFTONE | ONLY_V_OVERLAP },
/* One depth-conversion entry per destination depth; each also names the
* source depth equal to that destination depth and the 0bpp source */
{ fastPathDepthConv, CR_any, FAST_PATH_SRC_0BPP | FAST_PATH_SRC_32BPP | ONLY_DEST_32BPP },
{ fastPathDepthConv, CR_any, FAST_PATH_SRC_0BPP | FAST_PATH_SRC_16BPP | ONLY_DEST_16BPP },
{ fastPathDepthConv, CR_any, FAST_PATH_SRC_0BPP | FAST_PATH_SRC_8BPP | ONLY_DEST_8BPP },
{ fastPathDepthConv, CR_any, FAST_PATH_SRC_0BPP | FAST_PATH_SRC_4BPP | ONLY_DEST_4BPP },
{ fastPathDepthConv, CR_any, FAST_PATH_SRC_0BPP | FAST_PATH_SRC_2BPP | ONLY_DEST_2BPP },
{ fastPathDepthConv, CR_any, FAST_PATH_SRC_0BPP | FAST_PATH_SRC_1BPP | ONLY_DEST_1BPP },
};
/*
 * Registers every entry of this file's fastPaths table by passing the
 * table and its element count to addFastPaths().
 */
void addGenericFastPaths(void)
{
    const size_t entryCount = sizeof fastPaths / sizeof fastPaths[0];

    addFastPaths(fastPaths, entryCount);
}
#endif /* ENABLE_FAST_BLT */
|