Hallo, dies ist ein Test.

PWD: /www/data-lst1/unixsoft/unixsoft/kaempfer/.public_html

Running in File Mode
Relative path: ./../../../../../../usr/include/graphene-1.0/graphene-simd4x4f.h
Real path: /usr/include/graphene-1.0/graphene-simd4x4f.h
Zurück

/* graphene-simd4x4f.h: 4x4 float vector operations
 *
 * SPDX-License-Identifier: MIT
 *
 * Copyright 2014  Emmanuele Bassi
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#pragma once

#include "graphene-simd4f.h"

#include <math.h>
#include <float.h>

GRAPHENE_BEGIN_DECLS

/**
 * graphene_simd4x4f_t:
 *
 * A SIMD-based matrix type that uses four #graphene_simd4f_t vectors.
 *
 * The matrix is treated as row-major, i.e. the x, y, z, and w vectors
 * are rows, and elements of each vector are a column:
 *
 * |[
 *   graphene_simd4x4f_t = {
 *     x.x, x.y, x.z, x.w,
 *     y.x, y.y, y.z, y.w,
 *     z.x, z.y, z.z, z.w,
 *     w.x, w.y, w.z, w.w
 *   }
 * ]|
 *
 * The contents of the #graphene_simd4x4f_t type are private and
 * cannot be accessed directly; use the provided API instead.
 *
 * Since: 1.0
 */

/**
 * graphene_simd4x4f_init:
 * @x: a #graphene_simd4f_t for the first row
 * @y: a #graphene_simd4f_t for the second row
 * @z: a #graphene_simd4f_t for the third row
 * @w: a #graphene_simd4f_t for the fourth row
 *
 * Creates a new #graphene_simd4x4f_t using the given row vectors
 * to initialize it.
 *
 * Returns: the newly created #graphene_simd4x4f_t
 *
 * Since: 1.0
 */
static inline graphene_simd4x4f_t GRAPHENE_VECTORCALL
graphene_simd4x4f_init (graphene_simd4f_t x,
                        graphene_simd4f_t y,
                        graphene_simd4f_t z,
                        graphene_simd4f_t w)
{
  graphene_simd4x4f_t matrix;

  /* Each argument becomes the row of the same name. */
  matrix.x = x;
  matrix.y = y;
  matrix.z = z;
  matrix.w = w;

  return matrix;
}

/**
 * graphene_simd4x4f_init_identity:
 * @m: a #graphene_simd4x4f_t
 *
 * Initializes @m to be the identity matrix.
 *
 * Since: 1.0
 */
static inline void
graphene_simd4x4f_init_identity (graphene_simd4x4f_t *m)
{
  /* Ones on the main diagonal, zeros everywhere else. */
  m->x = graphene_simd4f_init (1.0f, 0.0f, 0.0f, 0.0f);
  m->y = graphene_simd4f_init (0.0f, 1.0f, 0.0f, 0.0f);
  m->z = graphene_simd4f_init (0.0f, 0.0f, 1.0f, 0.0f);
  m->w = graphene_simd4f_init (0.0f, 0.0f, 0.0f, 1.0f);
}

/**
 * graphene_simd4x4f_init_from_float:
 * @m: a #graphene_simd4x4f_t
 * @f: (array fixed-size=16): an array of 16 floating point values
 *
 * Initializes a #graphene_simd4x4f_t with the given array
 * of floating point values.
 *
 * Since: 1.0
 */
static inline void
graphene_simd4x4f_init_from_float (graphene_simd4x4f_t *m,
                                   const float         *f)
{
  /* The array is consumed in groups of four consecutive floats, one
   * group per row, in x, y, z, w order.
   */
  m->x = graphene_simd4f_init_4f (&f[0]);
  m->y = graphene_simd4f_init_4f (&f[4]);
  m->z = graphene_simd4f_init_4f (&f[8]);
  m->w = graphene_simd4f_init_4f (&f[12]);
}

/**
 * graphene_simd4x4f_to_float:
 * @m: a #graphene_simd4x4f_t
 * @v: (out caller-allocates) (array fixed-size=16): a floating
 *   point values vector capable of holding at least 16 values
 *
 * Copies the content of @m in a float array.
 *
 * Since: 1.0
 */
static inline void
graphene_simd4x4f_to_float (const graphene_simd4x4f_t *m,
                            float                     *v)
{
  /* Rows are written back to back, four floats each, in x, y, z, w
   * order; @v must therefore have room for 16 floats.
   */
  graphene_simd4f_dup_4f (m->x, &v[0]);
  graphene_simd4f_dup_4f (m->y, &v[4]);
  graphene_simd4f_dup_4f (m->z, &v[8]);
  graphene_simd4f_dup_4f (m->w, &v[12]);
}

/* Prototype of the out-of-line transpose. It is declared before any of
 * the same-named function-like macros below are defined, so the
 * declaration itself is never macro-expanded.
 *
 * Depending on the selected backend, one of the macros below shadows the
 * function. NOTE: every macro variant that reads (s)->x, (s)->y, (s)->z
 * and (s)->w expands its argument several times, so callers should not
 * pass an expression with side effects.
 */
GRAPHENE_AVAILABLE_IN_1_0
void    graphene_simd4x4f_transpose_in_place    (graphene_simd4x4f_t *s);

#if defined(GRAPHENE_USE_SSE)

/* SSE backend: hand the four rows to _MM_TRANSPOSE4_PS (presumably the
 * helper macro from <xmmintrin.h>; it is not defined in this file). With
 * GNU C the call is wrapped in a statement expression; with MSVC it is
 * used bare. With any other compiler no macro is defined and the
 * prototype above is used as-is.
 */
#ifdef __GNUC__
#define graphene_simd4x4f_transpose_in_place(s) \
  (__extension__ ({ \
    _MM_TRANSPOSE4_PS ((s)->x, (s)->y, (s)->z, (s)->w); \
  }))
#elif defined (_MSC_VER)
#define graphene_simd4x4f_transpose_in_place(s) \
  _MM_TRANSPOSE4_PS ((s)->x, (s)->y, (s)->z, (s)->w)
#endif

#elif defined(GRAPHENE_USE_GCC)

/* GCC vector backend: snapshot the four rows, then rebuild row N from
 * element N of every original row (vectors are indexed with []).
 */
#define graphene_simd4x4f_transpose_in_place(s) \
  (__extension__ ({ \
    const graphene_simd4f_t sx = (s)->x; \
    const graphene_simd4f_t sy = (s)->y; \
    const graphene_simd4f_t sz = (s)->z; \
    const graphene_simd4f_t sw = (s)->w; \
    (s)->x = graphene_simd4f_init (sx[0], sy[0], sz[0], sw[0]); \
    (s)->y = graphene_simd4f_init (sx[1], sy[1], sz[1], sw[1]); \
    (s)->z = graphene_simd4f_init (sx[2], sy[2], sz[2], sw[2]); \
    (s)->w = graphene_simd4f_init (sx[3], sy[3], sz[3], sw[3]); \
  }))

#elif defined(GRAPHENE_USE_ARM_NEON)

/* ARM NEON backend: same element-wise rebuild as the GCC vector backend,
 * but the lanes are read through the .f[] member of
 * graphene_simd4f_union_t instead of indexing the vector directly.
 */
# ifdef __GNUC__

#define graphene_simd4x4f_transpose_in_place(s) \
  (__extension__ ({ \
    const graphene_simd4f_union_t sx = { (s)->x }; \
    const graphene_simd4f_union_t sy = { (s)->y }; \
    const graphene_simd4f_union_t sz = { (s)->z }; \
    const graphene_simd4f_union_t sw = { (s)->w }; \
    (s)->x = graphene_simd4f_init (sx.f[0], sy.f[0], sz.f[0], sw.f[0]); \
    (s)->y = graphene_simd4f_init (sx.f[1], sy.f[1], sz.f[1], sw.f[1]); \
    (s)->z = graphene_simd4f_init (sx.f[2], sy.f[2], sz.f[2], sw.f[2]); \
    (s)->w = graphene_simd4f_init (sx.f[3], sy.f[3], sz.f[3], sw.f[3]); \
  }))

# elif defined (_MSC_VER)

/* No statement expressions are used for MSVC: the macro forwards to a
 * private inline helper, which evaluates its argument exactly once.
 */
#define graphene_simd4x4f_transpose_in_place(s) _simd4x4f_transpose_in_place(s)
static inline void
_simd4x4f_transpose_in_place (graphene_simd4x4f_t *s)
{
  /* Snapshot the rows before overwriting them. */
  const graphene_simd4f_union_t sx = { (s)->x };
  const graphene_simd4f_union_t sy = { (s)->y };
  const graphene_simd4f_union_t sz = { (s)->z };
  const graphene_simd4f_union_t sw = { (s)->w };
  /* Row N of the result collects element N of every original row. */
  (s)->x = graphene_simd4f_init (sx.f[0], sy.f[0], sz.f[0], sw.f[0]);
  (s)->y = graphene_simd4f_init (sx.f[1], sy.f[1], sz.f[1], sw.f[1]);
  (s)->z = graphene_simd4f_init (sx.f[2], sy.f[2], sz.f[2], sw.f[2]);
  (s)->w = graphene_simd4f_init (sx.f[3], sy.f[3], sz.f[3], sw.f[3]);
}

# endif

#elif defined(GRAPHENE_USE_SCALAR)

/* Scalar backend: the macro only casts its argument and calls the real
 * function declared above. A function-like macro is not re-expanded
 * inside its own replacement, so this does not recurse.
 */
#define graphene_simd4x4f_transpose_in_place(s) \
  (graphene_simd4x4f_transpose_in_place ((graphene_simd4x4f_t *) (s)))

#else
# error "No implementation for graphene_simd4x4f_t defined."
#endif

/**
 * graphene_simd4x4f_sum:
 * @a: a #graphene_simd4x4f_t
 * @res: (out): return location for the sum vector
 *
 * Adds all the row vectors of @a.
 *
 * Since: 1.0
 */
static inline void
graphene_simd4x4f_sum (const graphene_simd4x4f_t *a,
                       graphene_simd4f_t         *res)
{
  /* Accumulate left to right: ((x + y) + z) + w. */
  *res = graphene_simd4f_add (graphene_simd4f_add (graphene_simd4f_add (a->x, a->y),
                                                   a->z),
                              a->w);
}

/**
 * graphene_simd4x4f_vec4_mul:
 * @a: a #graphene_simd4x4f_t
 * @b: a #graphene_simd4f_t
 * @res: (out): return location for a #graphene_simd4f_t
 *
 * Left multiplies the given #graphene_simd4x4f_t with the given
 * #graphene_simd4f_t row vector using a dot product:
 *
 * |[
 * res = b × A
 *
 *     = ⎡x⎤ ⎛ x.x  x.y  x.z  x.w ⎞
 *       ⎜y⎟ ⎜ y.x  y.y  y.z  y.w ⎟
 *       ⎜z⎟ ⎜ z.x  z.y  z.z  z.w ⎟
 *       ⎣w⎦ ⎝ w.x  w.y  w.z  w.w ⎠
 *
 *     = [ x.x × x   x.y × x   x.z × x   x.w × x ]
 *            +         +         +         +
 *       [ y.x × y   y.y × y   y.z × y   y.w × y ]
 *            +         +         +         +
 *       [ z.x × z   z.y × z   z.z × z   z.w × z ]
 *            +         +         +         +
 *       [ w.x × w   w.y × w   w.z × w   w.w × w ]
 *
 *     = ⎡ x.x × x + y.x × y + z.x × z + w.x × w ⎤
 *       ⎜ x.y × x + y.y × y + z.y × z + w.y × w ⎟
 *       ⎜ x.z × x + y.z × y + z.z × z + w.z × w ⎟
 *       ⎣ x.w × x + y.w × y + z.w × z + w.w × w ⎦
 * ]|
 *
 * Since: 1.0
 */
static inline void
graphene_simd4x4f_vec4_mul (const graphene_simd4x4f_t *a,
                            const graphene_simd4f_t   *b,
                            graphene_simd4f_t         *res)
{
  /* Work on a copy of the vector, so that @res may point at @b. */
  const graphene_simd4f_t vec = *b;

  /* Multiply each row of the matrix by the splat of the matching
   * component of the vector, then add the four products pairwise:
   * (x + y) + (z + w).
   */
  *res =
    graphene_simd4f_add (graphene_simd4f_add (graphene_simd4f_mul (a->x, graphene_simd4f_splat_x (vec)),
                                              graphene_simd4f_mul (a->y, graphene_simd4f_splat_y (vec))),
                         graphene_simd4f_add (graphene_simd4f_mul (a->z, graphene_simd4f_splat_z (vec)),
                                              graphene_simd4f_mul (a->w, graphene_simd4f_splat_w (vec))));
}

/**
 * graphene_simd4x4f_vec3_mul:
 * @m: a #graphene_simd4x4f_t
 * @v: a #graphene_simd4f_t
 * @res: (out): return location for a #graphene_simd4f_t
 *
 * Left multiplies the given #graphene_simd4x4f_t with the given
 * #graphene_simd4f_t, using only the first three row vectors
 * of the matrix, and the first three components of the vector;
 * the W components of the matrix and vector are ignored:
 *
 * |[
 * res = b × A
 *
 *     = ⎡x⎤ ⎛ x.x  x.y  x.z ⎞
 *       ⎜y⎟ ⎜ y.x  y.y  y.z ⎟
 *       ⎣z⎦ ⎝ z.x  z.y  z.z ⎠
 *
 *     = [ x.x × x   x.y × x   x.z × x ]
 *            +         +         +
 *       [ y.x × y   y.y × y   y.z × y ]
 *            +         +         +
 *       [ z.x × z   z.y × z   z.z × z ]
 *
 *     = ⎡ x.x × x + y.x × y + z.x × z ⎤
 *       ⎜ x.y × x + y.y × y + z.y × z ⎟
 *       ⎜ x.z × x + y.z × y + z.z × z ⎟
 *       ⎣               0             ⎦
 * ]|
 *
 * See also: graphene_simd4x4f_vec4_mul(), graphene_simd4x4f_point3_mul()
 *
 * Since: 1.0
 */
static inline void
graphene_simd4x4f_vec3_mul (const graphene_simd4x4f_t *m,
                            const graphene_simd4f_t   *v,
                            graphene_simd4f_t         *res)
{
  const graphene_simd4f_t vec = *v;
  graphene_simd4f_t acc;

  /* Only the x, y, and z rows take part: (x + y) + z. */
  acc = graphene_simd4f_add (graphene_simd4f_add (graphene_simd4f_mul (m->x, graphene_simd4f_splat_x (vec)),
                                                  graphene_simd4f_mul (m->y, graphene_simd4f_splat_y (vec))),
                             graphene_simd4f_mul (m->z, graphene_simd4f_splat_z (vec)));

  /* The W component of the result is cleared before it is stored. */
  *res = graphene_simd4f_zero_w (acc);
}

/**
 * graphene_simd4x4f_point3_mul:
 * @m: a #graphene_simd4x4f_t
 * @p: a #graphene_simd4f_t
 * @res: (out): return location for a #graphene_simd4f_t
 *
 * Multiplies the given #graphene_simd4x4f_t with the given
 * #graphene_simd4f_t.
 *
 * Unlike graphene_simd4x4f_vec3_mul(), this function will
 * use the W components of the matrix:
 *
 * |[
 * res = b × A
 *
 *     = ⎡x⎤ ⎛ x.x  x.y  x.z  x.w ⎞
 *       ⎜y⎟ ⎜ y.x  y.y  y.z  y.w ⎟
 *       ⎜z⎟ ⎜ z.x  z.y  z.z  z.w ⎟
 *       ⎣w⎦ ⎝ w.x  w.y  w.z  w.w ⎠
 *
 *     = [ x.x × x   x.y × x   x.z × x   x.w × x ]
 *            +         +         +         +
 *       [ y.x × y   y.y × y   y.z × y   y.w × y ]
 *            +         +         +         +
 *       [ z.x × z   z.y × z   z.z × z   z.w × z ]
 *            +         +         +         +
 *       [   w.x       w.y       w.z       w.w   ]
 *
 *     = ⎡ x.x × x + y.x × y + z.x × z + w.x ⎤
 *       ⎜ x.y × x + y.y × y + z.y × z + w.y ⎟
 *       ⎜ x.z × x + y.z × y + z.z × z + w.z ⎟
 *       ⎣ x.w × x + y.w × y + z.w × z + w.w ⎦
 * ]|
 *
 * Since: 1.0
 */
static inline void
graphene_simd4x4f_point3_mul (const graphene_simd4x4f_t *m,
                              const graphene_simd4f_t   *p,
                              graphene_simd4f_t         *res)
{
  /* Work on a copy of the point, so that @res may point at @p. */
  const graphene_simd4f_t point = *p;

  /* Same as a full vector multiplication, except that the fourth row
   * is added unscaled: (x + y) + (z + w).
   */
  *res =
    graphene_simd4f_add (graphene_simd4f_add (graphene_simd4f_mul (m->x, graphene_simd4f_splat_x (point)),
                                              graphene_simd4f_mul (m->y, graphene_simd4f_splat_y (point))),
                         graphene_simd4f_add (graphene_simd4f_mul (m->z, graphene_simd4f_splat_z (point)),
                                              m->w));
}

/**
 * graphene_simd4x4f_transpose:
 * @s: a #graphene_simd4x4f_t
 * @res: (out): return location for the transposed matrix
 *
 * Transposes the given #graphene_simd4x4f_t.
 *
 * Since: 1.0
 */
static inline void
graphene_simd4x4f_transpose (const graphene_simd4x4f_t *s,
                             graphene_simd4x4f_t       *res)
{
  /* Transpose a private copy, leaving @s untouched, then publish it. */
  graphene_simd4x4f_t flipped = *s;

  graphene_simd4x4f_transpose_in_place (&flipped);

  *res = flipped;
}

/**
 * graphene_simd4x4f_inv_ortho_vec3_mul:
 * @a: a #graphene_simd4x4f_t
 * @b: a #graphene_simd4f_t
 * @res: (out): return location for the transformed vector
 *
 * Performs the inverse orthographic transformation of the first
 * three components in the given vector, using the first three
 * row vectors of the given SIMD matrix.
 *
 * Since: 1.0
 */
static inline void
graphene_simd4x4f_inv_ortho_vec3_mul (const graphene_simd4x4f_t *a,
                                      const graphene_simd4f_t   *b,
                                      graphene_simd4f_t         *res)
{
  graphene_simd4f_t vec = *b;
  graphene_simd4x4f_t inv = *a;

  /* Drop the fourth row and transpose what is left of the matrix. */
  inv.w = graphene_simd4f_init (0.f, 0.f, 0.f, 0.f);
  graphene_simd4x4f_transpose_in_place (&inv);

  graphene_simd4x4f_vec3_mul (&inv, &vec, res);
}

/**
 * graphene_simd4x4f_inv_ortho_point3_mul:
 * @a: a #graphene_simd4x4f_t
 * @b: a #graphene_simd4f_t
 * @res: (out): return location for the result vector
 *
 * Performs the inverse orthographic transformation of the first
 * three components in the given vector, using the given SIMD
 * matrix.
 *
 * Unlike graphene_simd4x4f_inv_ortho_vec3_mul(), this function
 * will also use the fourth row vector of the SIMD matrix.
 *
 * Since: 1.0
 */
static inline void
graphene_simd4x4f_inv_ortho_point3_mul (const graphene_simd4x4f_t *a,
                                        const graphene_simd4f_t   *b,
                                        graphene_simd4f_t         *res)
{
  graphene_simd4x4f_t inv = *a;
  /* Subtract the fourth row of the matrix from the point first. */
  graphene_simd4f_t offset = graphene_simd4f_sub (*b, a->w);

  /* Drop the fourth row and transpose what is left of the matrix. */
  inv.w = graphene_simd4f_init (0.f, 0.f, 0.f, 0.f);
  graphene_simd4x4f_transpose_in_place (&inv);

  graphene_simd4x4f_point3_mul (&inv, &offset, res);
}

/**
 * graphene_simd4x4f_matrix_mul:
 * @a: a #graphene_simd4x4f_t
 * @b: a #graphene_simd4x4f_t
 * @res: (out): return location for the result
 *
 * Multiplies the two matrices, following the convention:
 *
 * |[
 *   res = A × B
 *
 *       = ⎡ A.x × B ⎤
 *         ⎜ A.y × B ⎟
 *         ⎜ A.z × B ⎟
 *         ⎣ A.w × B ⎦
 *
 *       = ⎡ res.x ⎤
 *         ⎜ res.y ⎟
 *         ⎜ res.z ⎟
 *         ⎣ res.w ⎦
 * ]|
 *
 * See also: graphene_simd4x4f_vec4_mul()
 *
 * Since: 1.0
 */
static inline void
graphene_simd4x4f_matrix_mul (const graphene_simd4x4f_t *a,
                              const graphene_simd4x4f_t *b,
                              graphene_simd4x4f_t       *res)
{
#if 0
  /* Reference implementation, kept for comparison only: the textbook
   * row-times-column product. Measured at 329ns per multiplication with
   * the scalar backend, 32ns with SSE (about 10x faster), and 66ns with
   * GCC vectors (only 5x faster). The cost is dominated by the transpose
   * and by the single lane reads needed for every scalar sum.
   */
  graphene_simd4x4f_t t;

  graphene_simd4x4f_transpose (b, &t);

  res->x =
    graphene_simd4f_init (graphene_simd4f_sum_scalar (graphene_simd4f_mul (a->x, t.x)),
                          graphene_simd4f_sum_scalar (graphene_simd4f_mul (a->x, t.y)),
                          graphene_simd4f_sum_scalar (graphene_simd4f_mul (a->x, t.z)),
                          graphene_simd4f_sum_scalar (graphene_simd4f_mul (a->x, t.w)));

  res->y =
    graphene_simd4f_init (graphene_simd4f_sum_scalar (graphene_simd4f_mul (a->y, t.x)),
                          graphene_simd4f_sum_scalar (graphene_simd4f_mul (a->y, t.y)),
                          graphene_simd4f_sum_scalar (graphene_simd4f_mul (a->y, t.z)),
                          graphene_simd4f_sum_scalar (graphene_simd4f_mul (a->y, t.w)));

  res->z =
    graphene_simd4f_init (graphene_simd4f_sum_scalar (graphene_simd4f_mul (a->z, t.x)),
                          graphene_simd4f_sum_scalar (graphene_simd4f_mul (a->z, t.y)),
                          graphene_simd4f_sum_scalar (graphene_simd4f_mul (a->z, t.z)),
                          graphene_simd4f_sum_scalar (graphene_simd4f_mul (a->z, t.w)));

  res->w =
    graphene_simd4f_init (graphene_simd4f_sum_scalar (graphene_simd4f_mul (a->w, t.x)),
                          graphene_simd4f_sum_scalar (graphene_simd4f_mul (a->w, t.y)),
                          graphene_simd4f_sum_scalar (graphene_simd4f_mul (a->w, t.z)),
                          graphene_simd4f_sum_scalar (graphene_simd4f_mul (a->w, t.w)));
#else
  /* Optimized implementation: every row of the result is the matching
   * row of A multiplied with the whole of B. Same results as the
   * reference code, at 91ns with the scalar backend, 19ns with GCC
   * vectors, and 16ns with SSE.
   *
   * Matrix multiplication is non-commutative: the argument order below
   * is the one that yields A x B.
   */
  graphene_simd4x4f_t product;

  /* The rows are collected in a local first, so that @res may point at
   * either operand.
   */
  graphene_simd4x4f_vec4_mul (b, &a->x, &product.x);
  graphene_simd4x4f_vec4_mul (b, &a->y, &product.y);
  graphene_simd4x4f_vec4_mul (b, &a->z, &product.z);
  graphene_simd4x4f_vec4_mul (b, &a->w, &product.w);

  *res = product;
#endif
}

/**
 * graphene_simd4x4f_init_perspective:
 * @m: a #graphene_simd4x4f_t
 * @fovy_rad: the angle of the field of vision, in radians
 * @aspect: the aspect value
 * @z_near: the depth of the near clipping plane
 * @z_far: the depth of the far clipping plane
 *
 * Initializes a #graphene_simd4x4f_t with a perspective projection.
 *
 * Since: 1.0
 */
static inline void
graphene_simd4x4f_init_perspective (graphene_simd4x4f_t *m,
                                    float                fovy_rad,
                                    float                aspect,
                                    float                z_near,
                                    float                z_far)
{
  /* cot (fovy / 2), computed as tan (pi / 2 - fovy / 2) */
  float cot_half_fovy = tanf (GRAPHENE_PI_2 - fovy_rad * 0.5f);
  float depth_range = z_far - z_near;

  float scale_x = cot_half_fovy / aspect;
  float scale_y = cot_half_fovy;
  float scale_z = -(z_far + z_near) / depth_range;
  float offset_z = -2 * z_near * z_far / depth_range;

  m->x = graphene_simd4f_init (scale_x,    0.0f,     0.0f,  0.0f);
  m->y = graphene_simd4f_init (   0.0f, scale_y,     0.0f,  0.0f);
  m->z = graphene_simd4f_init (   0.0f,    0.0f,  scale_z, -1.0f);
  m->w = graphene_simd4f_init (   0.0f,    0.0f, offset_z,  0.0f);
}

/**
 * graphene_simd4x4f_init_ortho:
 * @m: a #graphene_simd4x4f_t
 * @left: edge of the left clipping plane
 * @right: edge of the right clipping plane
 * @bottom: edge of the bottom clipping plane
 * @top: edge of the top clipping plane
 * @z_near: depth of the near clipping plane
 * @z_far: depth of the far clipping plane
 *
 * Initializes the given SIMD matrix with an orthographic projection.
 *
 * Since: 1.0
 */
static inline void
graphene_simd4x4f_init_ortho (graphene_simd4x4f_t *m,
                              float                left,
                              float                right,
                              float                bottom,
                              float                top,
                              float                z_near,
                              float                z_far)
{
  /* Extent of the clipping volume along each axis. */
  float width = right - left;
  float height = top - bottom;
  float depth = z_far - z_near;

  /* Scale factors, stored on the diagonal. */
  float scale_x = 2.0f / width;
  float scale_y = 2.0f / height;
  float scale_z = -2.0f / depth;

  /* Offsets, stored in the fourth row. */
  float offset_x = -(right + left) / width;
  float offset_y = -(top + bottom) / height;
  float offset_z = -(z_far + z_near) / depth;

  m->x = graphene_simd4f_init ( scale_x,     0.0f,     0.0f, 0.0f);
  m->y = graphene_simd4f_init (    0.0f,  scale_y,     0.0f, 0.0f);
  m->z = graphene_simd4f_init (    0.0f,     0.0f,  scale_z, 0.0f);
  m->w = graphene_simd4f_init (offset_x, offset_y, offset_z, 1.0f);
}

/**
 * graphene_simd4x4f_init_look_at:
 * @m: a #graphene_simd4x4f_t
 * @eye: vector for the camera coordinates
 * @center: vector for the object coordinates
 * @up: vector for the upwards direction
 *
 * Initializes a SIMD matrix with the projection necessary for
 * the camera at the @eye coordinates to look at the object at
 * the @center coordinates. The top of the camera is aligned to
 * the @up vector.
 *
 * Since: 1.0
 */
static inline void
graphene_simd4x4f_init_look_at (graphene_simd4x4f_t *m,
                                graphene_simd4f_t    eye,
                                graphene_simd4f_t    center,
                                graphene_simd4f_t    up)
{
  const graphene_simd4f_t view_dir = graphene_simd4f_sub (center, eye);
  graphene_simd4f_t forward;
  graphene_simd4f_t side;
  graphene_simd4f_t right_axis;
  graphene_simd4f_t up_axis;
  float eye_pos[4];

  if (graphene_simd4f_get_x (graphene_simd4f_dot3 (view_dir, view_dir)) < FLT_EPSILON)
    {
      /* eye and center coincide: fall back to a fixed direction */
      forward = graphene_simd4f_init (0, 0, 1, 0);
    }
  else
    {
      forward = graphene_simd4f_normalize3 (view_dir);
    }

  side = graphene_simd4f_cross3 (forward, up);
  if (graphene_simd4f_get_x (graphene_simd4f_dot3 (side, side)) < FLT_EPSILON)
    {
      /* up and the viewing direction are parallel: nudge the viewing
       * direction slightly, so that the cross product is usable. The
       * nudge goes along X when the Z component of up is 1, and along
       * Z otherwise.
       */
      graphene_simd4f_t nudge;

      if (fabs (graphene_simd4f_get_z (up) - 1.0) < FLT_EPSILON)
        nudge = graphene_simd4f_init (0.0001f, 0, 0, 0);
      else
        nudge = graphene_simd4f_init (0, 0, 0.0001f, 0);

      forward = graphene_simd4f_add (forward, nudge);
      forward = graphene_simd4f_normalize3 (forward);
      side = graphene_simd4f_cross3 (forward, up);
    }

  right_axis = graphene_simd4f_normalize3 (side);
  up_axis = graphene_simd4f_cross3 (right_axis, forward);

  /* The fourth row holds the negated eye coordinates. */
  graphene_simd4f_dup_4f (eye, eye_pos);

  m->x = right_axis;
  m->y = up_axis;
  m->z = graphene_simd4f_neg (forward);
  m->w = graphene_simd4f_init (-eye_pos[0], -eye_pos[1], -eye_pos[2], 1.f);
}

/**
 * graphene_simd4x4f_init_frustum:
 * @m: a #graphene_simd4x4f_t
 * @left: distance of the left clipping plane
 * @right: distance of the right clipping plane
 * @bottom: distance of the bottom clipping plane
 * @top: distance of the top clipping plane
 * @z_near: distance of the near clipping plane
 * @z_far: distance of the far clipping plane
 *
 * Initializes a SIMD matrix with a frustum described by the distances
 * of six clipping planes.
 *
 * Since: 1.2
 */
static inline void
graphene_simd4x4f_init_frustum (graphene_simd4x4f_t *m,
                                float                left,
                                float                right,
                                float                bottom,
                                float                top,
                                float                z_near,
                                float                z_far)
{
  /* Extent of the frustum along each axis. */
  float width = right - left;
  float height = top - bottom;
  float depth = z_far - z_near;

  float scale_x = 2.f * z_near / width;
  float scale_y = 2.f * z_near / height;

  float skew_x = (right + left) / width;
  float skew_y = (top + bottom) / height;
  float scale_z = -1.f * (z_far + z_near) / depth;
  float offset_z = -2.f * z_far * z_near / depth;

  m->x = graphene_simd4f_init (scale_x,     0.f,      0.f,  0.f);
  m->y = graphene_simd4f_init (    0.f, scale_y,      0.f,  0.f);
  m->z = graphene_simd4f_init ( skew_x,  skew_y,  scale_z, -1.f);
  m->w = graphene_simd4f_init (    0.f,     0.f, offset_z,  0.f);
}

/**
 * graphene_simd4x4f_perspective:
 * @m: a #graphene_simd4x4f_t
 * @depth: depth of the perspective
 *
 * Adds a perspective transformation for the given @depth.
 *
 * Since: 1.0
 */
static inline void
graphene_simd4x4f_perspective (graphene_simd4x4f_t *m,
                               float                depth)
{
#if 1
  /* Every row gets the same treatment: its W component is replaced with
   *
   *   w + (z + (-1 / depth) * w)
   *
   * through graphene_simd4f_merge_w(). No check is made on @depth; a
   * zero value divides by zero.
   */
  const float neg_inv_depth = -1.0f / depth;

  const float m_xw = graphene_simd4f_get_w (m->x);
  const float m_yw = graphene_simd4f_get_w (m->y);
  const float m_zw = graphene_simd4f_get_w (m->z);
  const float m_ww = graphene_simd4f_get_w (m->w);

  const float p0 = graphene_simd4f_get_z (m->x) + neg_inv_depth * m_xw;
  const float p1 = graphene_simd4f_get_z (m->y) + neg_inv_depth * m_yw;
  const float p2 = graphene_simd4f_get_z (m->z) + neg_inv_depth * m_zw;
  const float p3 = graphene_simd4f_get_z (m->w) + neg_inv_depth * m_ww;

  const graphene_simd4f_t p_x = graphene_simd4f_merge_w (m->x, m_xw + p0);
  const graphene_simd4f_t p_y = graphene_simd4f_merge_w (m->y, m_yw + p1);
  const graphene_simd4f_t p_z = graphene_simd4f_merge_w (m->z, m_zw + p2);
  const graphene_simd4f_t p_w = graphene_simd4f_merge_w (m->w, m_ww + p3);
#else
  /* Alternative formulation of the code above, meant to keep as much of
   * the work as possible inside SIMD registers: the matrix is transposed,
   * so that the Z and W components of all four rows can be combined as
   * whole vectors. It should warrant a micro benchmark, because while the
   * code above is dominated by single channel reads, the code below has
   * a transpose operation.
   *
   * The locals are not const-qualified, since they are assigned after
   * their declaration; the vectors are combined with
   * graphene_simd4f_add(), which is the two-operand addition used
   * elsewhere in this file.
   *
   * NOTE(review): graphene_simd4f_reciprocal() may be less precise than
   * the scalar division used above; verify before enabling this branch.
   */
  graphene_simd4x4f_t t;
  graphene_simd4f_t f, p;
  graphene_simd4f_t p_x, p_y, p_z, p_w;

  graphene_simd4x4f_transpose (m, &t);

  f = graphene_simd4f_neg (graphene_simd4f_reciprocal (graphene_simd4f_splat (depth)));
  p = graphene_simd4f_add (t.w, graphene_simd4f_add (t.z, graphene_simd4f_mul (f, t.w)));
  p_x = graphene_simd4f_merge_w (m->x, graphene_simd4f_get_x (p));
  p_y = graphene_simd4f_merge_w (m->y, graphene_simd4f_get_y (p));
  p_z = graphene_simd4f_merge_w (m->z, graphene_simd4f_get_z (p));
  p_w = graphene_simd4f_merge_w (m->w, graphene_simd4f_get_w (p));
#endif

  *m = graphene_simd4x4f_init (p_x, p_y, p_z, p_w);
}

/**
 * graphene_simd4x4f_translation:
 * @m: a #graphene_simd4x4f_t
 * @x: coordinate of the X translation
 * @y: coordinate of the Y translation
 * @z: coordinate of the Z translation
 *
 * Initializes @m to contain a translation to the given coordinates.
 *
 * Since: 1.0
 */
static inline void
graphene_simd4x4f_translation (graphene_simd4x4f_t *m,
                               float                x,
                               float                y,
                               float                z)
{
  /* An identity matrix, with the offsets stored in the fourth row. */
  m->x = graphene_simd4f_init (1.0f, 0.0f, 0.0f, 0.0f);
  m->y = graphene_simd4f_init (0.0f, 1.0f, 0.0f, 0.0f);
  m->z = graphene_simd4f_init (0.0f, 0.0f, 1.0f, 0.0f);
  m->w = graphene_simd4f_init (   x,    y,    z, 1.0f);
}

/**
 * graphene_simd4x4f_scale:
 * @m: a #graphene_simd4x4f_t
 * @x: scaling factor on the X axis
 * @y: scaling factor on the Y axis
 * @z: scaling factor on the Z axis
 *
 * Initializes @m to contain a scaling transformation with the
 * given factors.
 *
 * Since: 1.0
 */
static inline void
graphene_simd4x4f_scale (graphene_simd4x4f_t *m,
                         float                x,
                         float                y,
                         float                z)
{
  /* The factors sit on the main diagonal; the last diagonal entry is 1. */
  m->x = graphene_simd4f_init (   x, 0.0f, 0.0f, 0.0f);
  m->y = graphene_simd4f_init (0.0f,    y, 0.0f, 0.0f);
  m->z = graphene_simd4f_init (0.0f, 0.0f,    z, 0.0f);
  m->w = graphene_simd4f_init (0.0f, 0.0f, 0.0f, 1.0f);
}

/**
 * graphene_simd4x4f_rotation:
 * @m: a #graphene_simd4x4f_t
 * @rad: the rotation, in radians
 * @axis: the vector of the axis of rotation
 *
 * Initializes @m to contain a rotation of the given angle
 * along the given axis.
 *
 * Since: 1.0
 */
static inline void
graphene_simd4x4f_rotation (graphene_simd4x4f_t *m,
                            float                rad,
                            graphene_simd4f_t    axis)
{
  graphene_simd4f_t unit, row_x, row_y, row_z;
  float angle, sin_a, cos_a, one_minus_cos;
  float ux, uy, uz;
  float xy, yz, zx;
  float xx, yy, zz;

  /* The matrix is built for the negated angle, around the normalized
   * axis.
   */
  angle = -rad;
  unit = graphene_simd4f_normalize3 (axis);

  /* graphene_sincos() is a private function and this is a public header,
   * so the sine and cosine are computed separately.
   */
  sin_a = sinf (angle);
  cos_a = cosf (angle);
  one_minus_cos = 1.0f - cos_a;

  ux = graphene_simd4f_get_x (unit);
  uy = graphene_simd4f_get_y (unit);
  uz = graphene_simd4f_get_z (unit);

  /* Mixed products of the axis components, scaled by (1 - cos). */
  xy = ux * uy * one_minus_cos;
  yz = uy * uz * one_minus_cos;
  zx = uz * ux * one_minus_cos;

  /* Squares of the axis components. */
  xx = ux * ux;
  yy = uy * uy;
  zz = uz * uz;

  row_x = graphene_simd4f_init (xx + cos_a * (1.0f - xx), xy - uz * sin_a, zx + uy * sin_a, 0.f);
  row_y = graphene_simd4f_init (xy + uz * sin_a, yy + cos_a * (1.0f - yy), yz - ux * sin_a, 0.f);
  row_z = graphene_simd4f_init (zx - uy * sin_a, yz + ux * sin_a, zz + cos_a * (1.0f - zz), 0.f);

  m->x = row_x;
  m->y = row_y;
  m->z = row_z;
  m->w = graphene_simd4f_init (0.0f, 0.0f, 0.0f, 1.0f);
}

/**
 * graphene_simd4x4f_add:
 * @a: a #graphene_simd4x4f_t
 * @b: a #graphene_simd4x4f_t
 * @res: (out caller-allocates): return location for a #graphene_simd4x4f_t
 *
 * Adds each row vector of @a and @b and places the results in @res.
 *
 * Since: 1.0
 */
static inline void
graphene_simd4x4f_add (const graphene_simd4x4f_t *a,
                       const graphene_simd4x4f_t *b,
                       graphene_simd4x4f_t *res)
{
  graphene_simd4x4f_t out;

  /* Row-by-row sum: each row of @a plus the matching row of @b. */
  out.x = graphene_simd4f_add (a->x, b->x);
  out.y = graphene_simd4f_add (a->y, b->y);
  out.z = graphene_simd4f_add (a->z, b->z);
  out.w = graphene_simd4f_add (a->w, b->w);

  *res = out;
}

/**
 * graphene_simd4x4f_sub:
 * @a: a #graphene_simd4x4f_t
 * @b: a #graphene_simd4x4f_t
 * @res: (out caller-allocates): return location for a #graphene_simd4x4f_t
 *
 * Subtracts each row vector of @a and @b and places the results in @res.
 *
 * Since: 1.0
 */
static inline void
graphene_simd4x4f_sub (const graphene_simd4x4f_t *a,
                       const graphene_simd4x4f_t *b,
                       graphene_simd4x4f_t *res)
{
  graphene_simd4x4f_t out;

  /* Row-by-row difference: each row of @a minus the matching row of @b. */
  out.x = graphene_simd4f_sub (a->x, b->x);
  out.y = graphene_simd4f_sub (a->y, b->y);
  out.z = graphene_simd4f_sub (a->z, b->z);
  out.w = graphene_simd4f_sub (a->w, b->w);

  *res = out;
}

/**
 * graphene_simd4x4f_mul:
 * @a: a #graphene_simd4x4f_t
 * @b: a #graphene_simd4x4f_t
 * @res: (out caller-allocates): return location for a #graphene_simd4x4f_t
 *
 * Multiplies each row vector of @a and @b and places the results in @res.
 *
 * You most likely want graphene_simd4x4f_matrix_mul() instead.
 *
 * Since: 1.0
 */
static inline void
graphene_simd4x4f_mul (const graphene_simd4x4f_t *a,
                       const graphene_simd4x4f_t *b,
                       graphene_simd4x4f_t *res)
{
  graphene_simd4x4f_t out;

  /* Row-by-row product of matching rows; this is NOT the matrix
   * product, see graphene_simd4x4f_matrix_mul() for that.
   */
  out.x = graphene_simd4f_mul (a->x, b->x);
  out.y = graphene_simd4f_mul (a->y, b->y);
  out.z = graphene_simd4f_mul (a->z, b->z);
  out.w = graphene_simd4f_mul (a->w, b->w);

  *res = out;
}

/**
 * graphene_simd4x4f_div:
 * @a: a #graphene_simd4x4f_t
 * @b: a #graphene_simd4x4f_t
 * @res: (out caller-allocates): return location for a #graphene_simd4x4f_t
 *
 * Divides each row vector of @a and @b and places the results in @res.
 *
 * Since: 1.0
 */
static inline void
graphene_simd4x4f_div (const graphene_simd4x4f_t *a,
                       const graphene_simd4x4f_t *b,
                       graphene_simd4x4f_t *res)
{
  graphene_simd4x4f_t out;

  /* Row-by-row quotient: each row of @a divided by the matching row
   * of @b.
   */
  out.x = graphene_simd4f_div (a->x, b->x);
  out.y = graphene_simd4f_div (a->y, b->y);
  out.z = graphene_simd4f_div (a->z, b->z);
  out.w = graphene_simd4f_div (a->w, b->w);

  *res = out;
}

/**
 * graphene_simd4x4f_inverse:
 * @m: a #graphene_simd4x4f_t
 * @res: (out): return location for the inverse matrix
 *
 * Inverts the given #graphene_simd4x4f_t.
 *
 * The matrix is treated as not invertible when the absolute value of
 * the computed determinant is smaller than `FLT_EPSILON`; in that case
 * @res is not written to.
 *
 * Returns: `true` if the matrix was invertible
 *
 * Since: 1.0
 */
static inline bool
graphene_simd4x4f_inverse (const graphene_simd4x4f_t *m,
                           graphene_simd4x4f_t       *res)
{
  /* split rows */
  const graphene_simd4f_t r0 = m->x;
  const graphene_simd4f_t r1 = m->y;
  const graphene_simd4f_t r2 = m->z;
  const graphene_simd4f_t r3 = m->w;

  /* cofactors: start from three shuffled copies (wxyz, zwxy, yzwx)
   * of each of the four rows
   */
  const graphene_simd4f_t r0_wxyz = graphene_simd4f_shuffle_wxyz (r0);
  const graphene_simd4f_t r0_zwxy = graphene_simd4f_shuffle_zwxy (r0);
  const graphene_simd4f_t r0_yzwx = graphene_simd4f_shuffle_yzwx (r0);

const graphene_simd4f_t r1_wxyz = graphene_simd4f_shuffle_wxyz (r1);
  const graphene_simd4f_t r1_zwxy = graphene_simd4f_shuffle_zwxy (r1);
  const graphene_simd4f_t r1_yzwx = graphene_simd4f_shuffle_yzwx (r1);

const graphene_simd4f_t r2_wxyz = graphene_simd4f_shuffle_wxyz (r2);
  const graphene_simd4f_t r2_zwxy = graphene_simd4f_shuffle_zwxy (r2);
  const graphene_simd4f_t r2_yzwx = graphene_simd4f_shuffle_yzwx (r2);

const graphene_simd4f_t r3_wxyz = graphene_simd4f_shuffle_wxyz (r3);
  const graphene_simd4f_t r3_zwxy = graphene_simd4f_shuffle_zwxy (r3);
  const graphene_simd4f_t r3_yzwx = graphene_simd4f_shuffle_yzwx (r3);

  /* pairwise products of rows 0 and 1 */
const graphene_simd4f_t r0_wxyz_x_r1 = graphene_simd4f_mul (r0_wxyz, r1);
  const graphene_simd4f_t r0_wxyz_x_r1_yzwx = graphene_simd4f_mul (r0_wxyz, r1_yzwx);
  const graphene_simd4f_t r0_wxyz_x_r1_zwxy = graphene_simd4f_mul (r0_wxyz, r1_zwxy);

  /* pairwise products of rows 2 and 3 */
const graphene_simd4f_t r2_wxyz_x_r3 = graphene_simd4f_mul (r2_wxyz, r3);
  const graphene_simd4f_t r2_wxyz_x_r3_yzwx = graphene_simd4f_mul (r2_wxyz, r3_yzwx);
  const graphene_simd4f_t r2_wxyz_x_r3_zwxy = graphene_simd4f_mul (r2_wxyz, r3_zwxy);

  /* ar1..ar3: differences of the row 2 / row 3 products; they only
   * depend on the lower two rows of the matrix
   */
const graphene_simd4f_t ar1 = graphene_simd4f_sub (graphene_simd4f_shuffle_wxyz (r2_wxyz_x_r3_zwxy),
                                                     graphene_simd4f_shuffle_zwxy (r2_wxyz_x_r3));
  const graphene_simd4f_t ar2 = graphene_simd4f_sub (graphene_simd4f_shuffle_zwxy (r2_wxyz_x_r3_yzwx),
                                                     r2_wxyz_x_r3_yzwx);
  const graphene_simd4f_t ar3 = graphene_simd4f_sub (r2_wxyz_x_r3_zwxy,
                                                     graphene_simd4f_shuffle_wxyz (r2_wxyz_x_r3));

  /* br1..br3: the same differences, built from the row 0 / row 1
   * products; they only depend on the upper two rows of the matrix
   */
const graphene_simd4f_t br1 = graphene_simd4f_sub (graphene_simd4f_shuffle_wxyz (r0_wxyz_x_r1_zwxy),
                                                     graphene_simd4f_shuffle_zwxy (r0_wxyz_x_r1));
  const graphene_simd4f_t br2 = graphene_simd4f_sub (graphene_simd4f_shuffle_zwxy (r0_wxyz_x_r1_yzwx),
                                                     r0_wxyz_x_r1_yzwx);
  const graphene_simd4f_t br3 = graphene_simd4f_sub (r0_wxyz_x_r1_zwxy,
                                                     graphene_simd4f_shuffle_wxyz (r0_wxyz_x_r1));

  /* rows 0 and 1 are combined with the ar* terms, rows 2 and 3 with
   * the br* terms; note that r1_sum pairs the shuffles with the ar*
   * terms in a different order than the other three sums
   */
const graphene_simd4f_t r0_sum =
    graphene_simd4f_madd (r0_yzwx, ar3,
                          graphene_simd4f_madd (r0_zwxy, ar2,
                                                graphene_simd4f_mul (r0_wxyz, ar1)));
  const graphene_simd4f_t r1_sum =
    graphene_simd4f_madd (r1_wxyz, ar1,
                          graphene_simd4f_madd (r1_zwxy, ar2,
                                                graphene_simd4f_mul (r1_yzwx, ar3)));
  const graphene_simd4f_t r2_sum =
    graphene_simd4f_madd (r2_yzwx, br3,
                          graphene_simd4f_madd (r2_zwxy, br2,
                                                graphene_simd4f_mul (r2_wxyz, br1)));
  const graphene_simd4f_t r3_sum =
    graphene_simd4f_madd (r3_yzwx, br3,
                          graphene_simd4f_madd (r3_zwxy, br2,
                                                graphene_simd4f_mul (r3_wxyz, br1)));

  /* determinant and its inverse: r1_sum is multiplied with row 0 and
   * the result is reduced; the value tested below is the x lane of det
   */
  const graphene_simd4f_t d0 = graphene_simd4f_mul (r1_sum, r0);
  const graphene_simd4f_t d1 = graphene_simd4f_add (d0, graphene_simd4f_merge_high (d0, d0));
  const graphene_simd4f_t det = graphene_simd4f_sub (d1, graphene_simd4f_splat_y (d1));
  /* only invert when the determinant is at least FLT_EPSILON in magnitude */
  if (fabsf (graphene_simd4f_get_x (det)) >= FLT_EPSILON)
    {
      /* 1 / det, with the x lane splatted across the whole vector */
      const graphene_simd4f_t invdet = graphene_simd4f_splat_x (graphene_simd4f_div (graphene_simd4f_splat (1.0f), det));

      /* scale the sign-adjusted sums by 1 / det; o0 is built from
       * r1_sum and o1 from r0_sum, and likewise o2 from r3_sum and
       * o3 from r2_sum
       */
const graphene_simd4f_t o0 = graphene_simd4f_mul (graphene_simd4f_flip_sign_0101 (r1_sum), invdet);
      const graphene_simd4f_t o1 = graphene_simd4f_mul (graphene_simd4f_flip_sign_1010 (r0_sum), invdet);
      const graphene_simd4f_t o2 = graphene_simd4f_mul (graphene_simd4f_flip_sign_0101 (r3_sum), invdet);
      const graphene_simd4f_t o3 = graphene_simd4f_mul (graphene_simd4f_flip_sign_1010 (r2_sum), invdet);

graphene_simd4x4f_t mt = graphene_simd4x4f_init (o0, o1, o2, o3);

      /* transpose the resulting matrix; this is the only write to res */
      graphene_simd4x4f_transpose (&mt, res);

return true;
    }

  /* determinant too close to zero: res has not been modified */
return false;
}

/**
 * graphene_simd4x4f_determinant:
 * @m: a #graphene_simd4x4f_t
 * @det_r: (out) (optional): return location for the matrix determinant
 * @invdet_r: (out) (optional): return location for the inverse of the
 *   matrix determinant
 *
 * Computes the determinant (and its inverse) of the given matrix.
 *
 * The determinant is computed with the same sequence of operations
 * used by graphene_simd4x4f_inverse(), which reads the scalar value
 * of the determinant from the x lane of the vector.
 *
 * Either return location can be %NULL, in which case the
 * corresponding value is not stored.
 *
 * No check is performed on the determinant before computing its
 * inverse: @invdet_r is always the result of dividing 1 by the
 * determinant.
 *
 * Since: 1.0
 */
static inline void
graphene_simd4x4f_determinant (const graphene_simd4x4f_t *m,
                               graphene_simd4f_t         *det_r,
                               graphene_simd4f_t         *invdet_r)
{
  /* split rows */
  const graphene_simd4f_t r0 = m->x;
  const graphene_simd4f_t r1 = m->y;
  const graphene_simd4f_t r2 = m->z;
  const graphene_simd4f_t r3 = m->w;

  /* cofactors: shuffled copies of the rows involved in the expansion */
  const graphene_simd4f_t r1_wxyz = graphene_simd4f_shuffle_wxyz (r1);
  const graphene_simd4f_t r1_zwxy = graphene_simd4f_shuffle_zwxy (r1);
  const graphene_simd4f_t r1_yzwx = graphene_simd4f_shuffle_yzwx (r1);

  const graphene_simd4f_t r2_wxyz = graphene_simd4f_shuffle_wxyz (r2);

  const graphene_simd4f_t r3_zwxy = graphene_simd4f_shuffle_zwxy (r3);
  const graphene_simd4f_t r3_yzwx = graphene_simd4f_shuffle_yzwx (r3);

  /* pairwise products of rows 2 and 3 */
  const graphene_simd4f_t r2_wxyz_x_r3 = graphene_simd4f_mul (r2_wxyz, r3);
  const graphene_simd4f_t r2_wxyz_x_r3_yzwx = graphene_simd4f_mul (r2_wxyz, r3_yzwx);
  const graphene_simd4f_t r2_wxyz_x_r3_zwxy = graphene_simd4f_mul (r2_wxyz, r3_zwxy);

  /* differences of the row 2 / row 3 products; these are the same
   * ar1..ar3 terms computed by graphene_simd4x4f_inverse()
   */
  const graphene_simd4f_t ar1 = graphene_simd4f_sub (graphene_simd4f_shuffle_wxyz (r2_wxyz_x_r3_zwxy),
                                                     graphene_simd4f_shuffle_zwxy (r2_wxyz_x_r3));
  const graphene_simd4f_t ar2 = graphene_simd4f_sub (graphene_simd4f_shuffle_zwxy (r2_wxyz_x_r3_yzwx),
                                                     r2_wxyz_x_r3_yzwx);
  const graphene_simd4f_t ar3 = graphene_simd4f_sub (r2_wxyz_x_r3_zwxy,
                                                     graphene_simd4f_shuffle_wxyz (r2_wxyz_x_r3));

  /* combine the shuffled copies of row 1 with the ar* terms */
  const graphene_simd4f_t r1_sum =
    graphene_simd4f_madd (r1_wxyz, ar1,
                          graphene_simd4f_madd (r1_zwxy, ar2,
                                                graphene_simd4f_mul (r1_yzwx, ar3)));

  /* determinant and its inverse: multiply with row 0 and reduce */
  const graphene_simd4f_t d0 = graphene_simd4f_mul (r1_sum, r0);
  const graphene_simd4f_t d1 = graphene_simd4f_add (d0, graphene_simd4f_merge_high (d0, d0));

  const graphene_simd4f_t det = graphene_simd4f_sub (d1, graphene_simd4f_splat_y (d1));

  /* 1 / det, with the x lane splatted across the whole vector */
  const graphene_simd4f_t invdet = graphene_simd4f_splat_x (graphene_simd4f_div (graphene_simd4f_splat (1.0f), det));

  /* both return locations are optional */
  if (det_r != NULL)
    *det_r = det;

  if (invdet_r != NULL)
    *invdet_r = invdet;
}

/**
 * graphene_simd4x4f_is_identity:
 * @m: a #graphene_simd4x4f_t
 *
 * Checks whether the given matrix is the identity matrix, by comparing
 * each of its rows for equality with the matching row of the identity
 * matrix.
 *
 * Returns: `true` if the matrix is the identity matrix
 *
 * Since: 1.0
 */
static inline bool
graphene_simd4x4f_is_identity (const graphene_simd4x4f_t *m)
{
  /* The four rows of the identity matrix. */
  const graphene_simd4f_t id_x = graphene_simd4f_init (1.0f, 0.0f, 0.0f, 0.0f);
  const graphene_simd4f_t id_y = graphene_simd4f_init (0.0f, 1.0f, 0.0f, 0.0f);
  const graphene_simd4f_t id_z = graphene_simd4f_init (0.0f, 0.0f, 1.0f, 0.0f);
  const graphene_simd4f_t id_w = graphene_simd4f_init (0.0f, 0.0f, 0.0f, 1.0f);

  /* Bail out on the first row that differs, top to bottom. */
  if (!graphene_simd4f_cmp_eq (m->x, id_x))
    return false;

  if (!graphene_simd4f_cmp_eq (m->y, id_y))
    return false;

  if (!graphene_simd4f_cmp_eq (m->z, id_z))
    return false;

  if (!graphene_simd4f_cmp_eq (m->w, id_w))
    return false;

  return true;
}

/**
 * graphene_simd4x4f_is_2d:
 * @m: a #graphene_simd4x4f_t
 *
 * Checks whether the given matrix is compatible with an affine
 * transformation matrix.
 *
 * The check inspects the z and w components of the first, second, and
 * fourth rows, and all the components of the third row, using
 * `FLT_EPSILON` as the tolerance.
 *
 * Returns: `true` if the matrix is compatible with an affine
 *   transformation matrix
 *
 * Since: 1.0
 */
static inline bool
graphene_simd4x4f_is_2d (const graphene_simd4x4f_t *m)
{
  float z_row[4];
  bool row_ok;

  /* first row: the z and w components must be (nearly) zero */
  row_ok = fabsf (graphene_simd4f_get_z (m->x)) < FLT_EPSILON &&
           fabsf (graphene_simd4f_get_w (m->x)) < FLT_EPSILON;
  if (!row_ok)
    return false;

  /* second row: same constraint as the first one */
  row_ok = fabsf (graphene_simd4f_get_z (m->y)) < FLT_EPSILON &&
           fabsf (graphene_simd4f_get_w (m->y)) < FLT_EPSILON;
  if (!row_ok)
    return false;

  /* third row: x, y, and w must be (nearly) zero.
   *
   * NOTE(review): the `1.f - fabsf (v) < FLT_EPSILON` test used for the
   * z component here, and for the w component of the fourth row below,
   * is one-sided: any magnitude greater than 1 passes it as well.
   * Confirm whether that is intended.
   */
  graphene_simd4f_dup_4f (m->z, z_row);
  row_ok = fabsf (z_row[0]) < FLT_EPSILON &&
           fabsf (z_row[1]) < FLT_EPSILON &&
           1.f - fabsf (z_row[2]) < FLT_EPSILON &&
           fabsf (z_row[3]) < FLT_EPSILON;
  if (!row_ok)
    return false;

  /* fourth row: z must be (nearly) zero, w is tested like z_row[2] */
  row_ok = fabsf (graphene_simd4f_get_z (m->w)) < FLT_EPSILON &&
           1.f - fabsf (graphene_simd4f_get_w (m->w)) < FLT_EPSILON;
  if (!row_ok)
    return false;

  return true;
}

GRAPHENE_END_DECLS