qemu/pixman/pixman/pixman-ssse3.c

   1 /*
   2  * Copyright © 2013 Soren Sandmann Pedersen
   3  * Copyright © 2013 Red Hat, Inc.
   4  *
   5  * Permission is hereby granted, free of charge, to any person obtaining a
   6  * copy of this software and associated documentation files (the "Software"),
   7  * to deal in the Software without restriction, including without limitation
   8  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   9  * and/or sell copies of the Software, and to permit persons to whom the
  10  * Software is furnished to do so, subject to the following conditions:
  11  *
  12  * The above copyright notice and this permission notice (including the next
  13  * paragraph) shall be included in all copies or substantial portions of the
  14  * Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  22  * DEALINGS IN THE SOFTWARE.
  23  *
  24  * Author: Soren Sandmann (soren.sandmann@gmail.com)
  25  */
  26 #ifdef HAVE_CONFIG_H
  27 #include <config.h>
  28 #endif
  29
  30 #include <stdlib.h>
  31 #include <mmintrin.h>
  32 #include <xmmintrin.h>
  33 #include <emmintrin.h>
  34 #include <tmmintrin.h>
  35 #include "pixman-private.h"
  36 #include "pixman-inlines.h"
  37
  38 typedef struct
  39 {
  40     int         y;
  41     uint64_t *  buffer;
  42 } line_t;
  43
  44 typedef struct
  45 {
  46     line_t              lines[2];
  47     pixman_fixed_t      y;
  48     pixman_fixed_t      x;
  49     uint64_t            data[1];
  50 } bilinear_info_t;
  51
  52 static void
  53 ssse3_fetch_horizontal (bits_image_t *image, line_t *line,
  54                         int y, pixman_fixed_t x, pixman_fixed_t ux, int n)
  55 {
  56     uint32_t *bits = image->bits + y * image->rowstride;
  57     __m128i vx = _mm_set_epi16 (
  58         - (x + 1), x, - (x + 1), x,
  59         - (x + ux + 1), x + ux,  - (x + ux + 1), x + ux);
  60     __m128i vux = _mm_set_epi16 (
  61         - 2 * ux, 2 * ux, - 2 * ux, 2 * ux,
  62         - 2 * ux, 2 * ux, - 2 * ux, 2 * ux);
  63     __m128i vaddc = _mm_set_epi16 (1, 0, 1, 0, 1, 0, 1, 0);
  64     __m128i *b = (__m128i *)line->buffer;
  65     __m128i vrl0, vrl1;
  66
  67     while ((n -= 2) >= 0)
  68     {
  69         __m128i vw, vr, s;
  70
  71         vrl1 = _mm_loadl_epi64 (
  72             (__m128i *)(bits + pixman_fixed_to_int (x + ux)));
  73         /* vrl1: R1, L1 */
  74
  75     final_pixel:
  76         vrl0 = _mm_loadl_epi64 (
  77             (__m128i *)(bits + pixman_fixed_to_int (x)));
  78         /* vrl0: R0, L0 */
  79
  80         /* The weights are based on vx which is a vector of
  81          *
  82          *    - (x + 1), x, - (x + 1), x,
  83          *          - (x + ux + 1), x + ux, - (x + ux + 1), x + ux
  84          *
  85          * so the 16 bit weights end up like this:
  86          *
  87          *    iw0, w0, iw0, w0, iw1, w1, iw1, w1
  88          *
  89          * and after shifting and packing, we get these bytes:
  90          *
  91          *    iw0, w0, iw0, w0, iw1, w1, iw1, w1,
  92          *        iw0, w0, iw0, w0, iw1, w1, iw1, w1,
  93          *
  94          * which means the first and the second input pixel
  95          * have to be interleaved like this:
  96          *
  97          *    la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
  98          *        lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
  99          *
 100          * before maddubsw can be used.
 101          */
 102
 103         vw = _mm_add_epi16 (
 104             vaddc, _mm_srli_epi16 (vx, 16 - BILINEAR_INTERPOLATION_BITS));
 105         /* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1
 106          */
 107
 108         vw = _mm_packus_epi16 (vw, vw);
 109         /* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1,
 110          *         iw0, w0, iw0, w0, iw1, w1, iw1, w1
 111          */
 112         vx = _mm_add_epi16 (vx, vux);
 113
 114         x += 2 * ux;
 115
 116         vr = _mm_unpacklo_epi16 (vrl1, vrl0);
 117         /* vr: rar0, rar1, rgb0, rgb1, lar0, lar1, lgb0, lgb1 */
 118
 119         s = _mm_shuffle_epi32 (vr, _MM_SHUFFLE (1, 0, 3, 2));
 120         /* s:  lar0, lar1, lgb0, lgb1, rar0, rar1, rgb0, rgb1 */
 121
 122         vr = _mm_unpackhi_epi8 (vr, s);
 123         /* vr: la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
 124          *         lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
 125          */
 126
 127         vr = _mm_maddubs_epi16 (vr, vw);
 128
 129         /* When the weight is 0, the inverse weight is
 130          * 128 which can't be represented in a signed byte.
 131          * As a result maddubsw computes the following:
 132          *
 133          *     r = l * -128 + r * 0
 134          *
 135          * rather than the desired
 136          *
 137          *     r = l * 128 + r * 0
 138          *
 139          * We fix this by taking the absolute value of the
 140          * result.
 141          */
 142         vr = _mm_abs_epi16 (vr);
 143
 144         /* vr: A0, R0, A1, R1, G0, B0, G1, B1 */
 145         _mm_store_si128 (b++, vr);
 146     }
 147
 148     if (n == -1)
 149     {
 150         vrl1 = _mm_setzero_si128();
 151         goto final_pixel;
 152     }
 153
 154     line->y = y;
 155 }
 156
 157 static uint32_t *
 158 ssse3_fetch_bilinear_cover (pixman_iter_t *iter, const uint32_t *mask)
 159 {
 160     pixman_fixed_t fx, ux;
 161     bilinear_info_t *info = iter->data;
 162     line_t *line0, *line1;
 163     int y0, y1;
 164     int32_t dist_y;
 165     __m128i vw;
 166     int i;
 167
 168     fx = info->x;
 169     ux = iter->image->common.transform->matrix[0][0];
 170
 171     y0 = pixman_fixed_to_int (info->y);
 172     y1 = y0 + 1;
 173
 174     line0 = &info->lines[y0 & 0x01];
 175     line1 = &info->lines[y1 & 0x01];
 176
 177     if (line0->y != y0)
 178     {
 179         ssse3_fetch_horizontal (
 180             &iter->image->bits, line0, y0, fx, ux, iter->width);
 181     }
 182
 183     if (line1->y != y1)
 184     {
 185         ssse3_fetch_horizontal (
 186             &iter->image->bits, line1, y1, fx, ux, iter->width);
 187     }
 188
 189     dist_y = pixman_fixed_to_bilinear_weight (info->y);
 190     dist_y <<= (16 - BILINEAR_INTERPOLATION_BITS);
 191
 192     vw = _mm_set_epi16 (
 193         dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y);
 194
 195     for (i = 0; i + 3 < iter->width; i += 4)
 196     {
 197         __m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i));
 198         __m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i));
 199         __m128i top1 = _mm_load_si128 ((__m128i *)(line0->buffer + i + 2));
 200         __m128i bot1 = _mm_load_si128 ((__m128i *)(line1->buffer + i + 2));
 201         __m128i r0, r1, tmp, p;
 202
 203         r0 = _mm_mulhi_epu16 (
 204             _mm_sub_epi16 (bot0, top0), vw);
 205         tmp = _mm_cmplt_epi16 (bot0, top0);
 206         tmp = _mm_and_si128 (tmp, vw);
 207         r0 = _mm_sub_epi16 (r0, tmp);
 208         r0 = _mm_add_epi16 (r0, top0);
 209         r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS);
 210         /* r0:  A0 R0 A1 R1 G0 B0 G1 B1 */
 211         r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1));
 212         /* r0:  A1 R1 G1 B1 A0 R0 G0 B0 */
 213
 214         r1 = _mm_mulhi_epu16 (
 215             _mm_sub_epi16 (bot1, top1), vw);
 216         tmp = _mm_cmplt_epi16 (bot1, top1);
 217         tmp = _mm_and_si128 (tmp, vw);
 218         r1 = _mm_sub_epi16 (r1, tmp);
 219         r1 = _mm_add_epi16 (r1, top1);
 220         r1 = _mm_srli_epi16 (r1, BILINEAR_INTERPOLATION_BITS);
 221         r1 = _mm_shuffle_epi32 (r1, _MM_SHUFFLE (2, 0, 3, 1));
 222         /* r1: A3 R3 G3 B3 A2 R2 G2 B2 */
 223
 224         p = _mm_packus_epi16 (r0, r1);
 225
 226         _mm_storeu_si128 ((__m128i *)(iter->buffer + i), p);
 227     }
 228
 229     while (i < iter->width)
 230     {
 231         __m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i));
 232         __m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i));
 233         __m128i r0, tmp, p;
 234
 235         r0 = _mm_mulhi_epu16 (
 236             _mm_sub_epi16 (bot0, top0), vw);
 237         tmp = _mm_cmplt_epi16 (bot0, top0);
 238         tmp = _mm_and_si128 (tmp, vw);
 239         r0 = _mm_sub_epi16 (r0, tmp);
 240         r0 = _mm_add_epi16 (r0, top0);
 241         r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS);
 242         /* r0:  A0 R0 A1 R1 G0 B0 G1 B1 */
 243         r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1));
 244         /* r0:  A1 R1 G1 B1 A0 R0 G0 B0 */
 245
 246         p = _mm_packus_epi16 (r0, r0);
 247
 248         if (iter->width - i == 1)
 249         {
 250             *(uint32_t *)(iter->buffer + i) = _mm_cvtsi128_si32 (p);
 251             i++;
 252         }
 253         else
 254         {
 255             _mm_storel_epi64 ((__m128i *)(iter->buffer + i), p);
 256             i += 2;
 257         }
 258     }
 259
 260     info->y += iter->image->common.transform->matrix[1][1];
 261
 262     return iter->buffer;
 263 }
 264
 265 static void
 266 ssse3_bilinear_cover_iter_fini (pixman_iter_t *iter)
 267 {
 268     free (iter->data);
 269 }
 270
 271 static void
 272 ssse3_bilinear_cover_iter_init (pixman_iter_t *iter, const pixman_iter_info_t *iter_info)
 273 {
 274     int width = iter->width;
 275     bilinear_info_t *info;
 276     pixman_vector_t v;
 277
 278     /* Reference point is the center of the pixel */
 279     v.vector[0] = pixman_int_to_fixed (iter->x) + pixman_fixed_1 / 2;
 280     v.vector[1] = pixman_int_to_fixed (iter->y) + pixman_fixed_1 / 2;
 281     v.vector[2] = pixman_fixed_1;
 282
 283     if (!pixman_transform_point_3d (iter->image->common.transform, &v))
 284         goto fail;
 285
 286     info = malloc (sizeof (*info) + (2 * width - 1) * sizeof (uint64_t) + 64);
 287     if (!info)
 288         goto fail;
 289
 290     info->x = v.vector[0] - pixman_fixed_1 / 2;
 291     info->y = v.vector[1] - pixman_fixed_1 / 2;
 292
 293 #define ALIGN(addr)                                                     \
 294     ((void *)((((uintptr_t)(addr)) + 15) & (~15)))
 295
 296     /* It is safe to set the y coordinates to -1 initially
 297      * because COVER_CLIP_BILINEAR ensures that we will only
 298      * be asked to fetch lines in the [0, height) interval
 299      */
 300     info->lines[0].y = -1;
 301     info->lines[0].buffer = ALIGN (&(info->data[0]));
 302     info->lines[1].y = -1;
 303     info->lines[1].buffer = ALIGN (info->lines[0].buffer + width);
 304
 305     iter->get_scanline = ssse3_fetch_bilinear_cover;
 306     iter->fini = ssse3_bilinear_cover_iter_fini;
 307
 308     iter->data = info;
 309     return;
 310
 311 fail:
 312     /* Something went wrong, either a bad matrix or OOM; in such cases,
 313      * we don't guarantee any particular rendering.
 314      */
 315     _pixman_log_error (
 316         FUNC, "Allocation failure or bad matrix, skipping rendering\n");
 317
 318     iter->get_scanline = _pixman_iter_get_scanline_noop;
 319     iter->fini = NULL;
 320 }
 321
 322 static const pixman_iter_info_t ssse3_iters[] =
 323 {
 324     { PIXMAN_a8r8g8b8,
 325       (FAST_PATH_STANDARD_FLAGS                 |
 326        FAST_PATH_SCALE_TRANSFORM                |
 327        FAST_PATH_BILINEAR_FILTER                |
 328        FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR),
 329       ITER_NARROW | ITER_SRC,
 330       ssse3_bilinear_cover_iter_init,
 331       NULL, NULL
 332     },
 333
 334     { PIXMAN_null },
 335 };
 336
 337 static const pixman_fast_path_t ssse3_fast_paths[] =
 338 {
 339     { PIXMAN_OP_NONE },
 340 };
 341
 342 pixman_implementation_t *
 343 _pixman_implementation_create_ssse3 (pixman_implementation_t *fallback)
 344 {
 345     pixman_implementation_t *imp =
 346         _pixman_implementation_create (fallback, ssse3_fast_paths);
 347
 348     imp->iter_info = ssse3_iters;
 349
 350     return imp;
 351 }