#ifndef BT_SIMD__QUATERNION_H_
#define BT_SIMD__QUATERNION_H_
#define vOnes (_mm_set_ps(1.0f, 1.0f, 1.0f, 1.0f))
#if defined(BT_USE_SSE)
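// Sign-bit masks: XOR-ing a quaternion with vQInv flips the signs of the x, y
// and z lanes (conjugate/inverse); XOR-ing with vPPPM flips only the w lane.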
#define vQInv (_mm_set_ps(+0.0f, -0.0f, -0.0f, -0.0f))
#define vPPPM (_mm_set_ps(-0.0f, +0.0f, +0.0f, +0.0f))
#elif defined(BT_USE_NEON)
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
	mVec128 = rhs.mVec128;
#ifndef BT_EULER_DEFAULT_ZYX
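		// Combine the half-angle sines and cosines of roll, pitch and yaw
		// into the quaternion components (x, y, z, w).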
		setValue(cosRoll * sinPitch * cosYaw + sinRoll * cosPitch * sinYaw,
			cosRoll * cosPitch * sinYaw - sinRoll * sinPitch * cosYaw,
			sinRoll * cosPitch * cosYaw - cosRoll * sinPitch * sinYaw,
			cosRoll * cosPitch * cosYaw + sinRoll * sinPitch * sinYaw);
		setValue(sinRoll * cosPitch * cosYaw - cosRoll * sinPitch * sinYaw,
			cosRoll * sinPitch * cosYaw + sinRoll * cosPitch * sinYaw,
			cosRoll * cosPitch * sinYaw - sinRoll * sinPitch * cosYaw,
			cosRoll * cosPitch * cosYaw + sinRoll * sinPitch * sinYaw);
#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
	mVec128 = _mm_add_ps(mVec128, q.mVec128);
#elif defined(BT_USE_NEON)
	mVec128 = vaddq_f32(mVec128, q.mVec128);
#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
	mVec128 = _mm_sub_ps(mVec128, q.mVec128);
#elif defined(BT_USE_NEON)
	mVec128 = vsubq_f32(mVec128, q.mVec128);
#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
	__m128 vs = _mm_load_ss(&s);
	vs = bt_pshufd_ps(vs, 0);
	mVec128 = _mm_mul_ps(mVec128, vs);
#elif defined(BT_USE_NEON)
	mVec128 = vmulq_n_f32(mVec128, s);
#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
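	// SSE path: quaternion product built from three shuffled partial products
	// plus (this->w) * q, with the sign of the w lane fixed up via vPPPM.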
	__m128 vQ2 = q.get128();
	__m128 A1 = bt_pshufd_ps(mVec128, BT_SHUFFLE(0,1,2,0));
	__m128 B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(3,3,3,0));
	__m128 A2 = bt_pshufd_ps(mVec128, BT_SHUFFLE(1,2,0,1));
	__m128 B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2,0,1,1));
	B1 = bt_pshufd_ps(mVec128, BT_SHUFFLE(2,0,1,2));
	B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1,2,0,2));
	mVec128 = bt_splat_ps(mVec128, 3);
	mVec128 = mVec128 * vQ2;
	mVec128 = mVec128 - B1;
	A1 = _mm_xor_ps(A1, vPPPM);
	mVec128 = mVec128 + A1;
#elif defined(BT_USE_NEON)
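	// NEON path: same partial-product scheme; vtrn/vext/vdup_lane build the
	// shuffled operands that the SSE path gets from bt_pshufd_ps.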
	float32x4_t vQ1 = mVec128;
	float32x4_t vQ2 = q.get128();
	float32x4_t A0, A1, B1, A2, B2, A3, B3;
	float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz;
	tmp = vtrn_f32( vget_high_f32(vQ1), vget_low_f32(vQ1) );
	tmp = vtrn_f32( vget_high_f32(vQ2), vget_low_f32(vQ2) );
	vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1);
	vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
	vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
	vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);
	A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);
	B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx);
	A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
	B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));
	A3 = vcombine_f32(vQ1zx, vQ1yz);
	B3 = vcombine_f32(vQ2yz, vQ2xz);
	A1 = vmulq_f32(A1, B1);
	A2 = vmulq_f32(A2, B2);
	A3 = vmulq_f32(A3, B3);
	A0 = vmulq_lane_f32(vQ2, vget_high_f32(vQ1), 1);
	A1 = vaddq_f32(A1, A2);
	A0 = vsubq_f32(A0, A3);
	A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM);
	A0 = vaddq_f32(A0, A1);
#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
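	// SSE path: multiply componentwise, then horizontally add the four lanes
	// to obtain the dot product.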
	vd = _mm_mul_ps(mVec128, q.mVec128);
	__m128 t = _mm_movehl_ps(vd, vd);
	vd = _mm_add_ps(vd, t);
	t = _mm_shuffle_ps(vd, vd, 0x55);
	vd = _mm_add_ss(vd, t);
	return _mm_cvtss_f32(vd);
#elif defined(BT_USE_NEON)
	float32x4_t vd = vmulq_f32(mVec128, q.mVec128);
	float32x2_t x = vpadd_f32(vget_low_f32(vd), vget_high_f32(vd));
	x = vpadd_f32(x, x);	// fold the two pairwise sums into the full dot product
	return vget_lane_f32(x, 0);
#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
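	// SSE path: squared length via a horizontal add, then scale every
	// component by the reciprocal of the length.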
	vd = _mm_mul_ps(mVec128, mVec128);
	__m128 t = _mm_movehl_ps(vd, vd);
	vd = _mm_add_ps(vd, t);
	t = _mm_shuffle_ps(vd, vd, 0x55);
	vd = _mm_add_ss(vd, t);
	vd = _mm_sqrt_ss(vd);
	vd = _mm_div_ss(vOnes, vd);
	vd = bt_pshufd_ps(vd, 0);
	mVec128 = _mm_mul_ps(mVec128, vd);
#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
	__m128 vs = _mm_load_ss(&s);
	vs = bt_pshufd_ps(vs, 0x00);
#elif defined(BT_USE_NEON)
	return btVector3(m_floats[0] * s, m_floats[1] * s, m_floats[2] * s);
#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
#elif defined(BT_USE_NEON)
	return btQuaternion((btSimdFloat4)veorq_s32((int32x4_t)mVec128, (int32x4_t)vQInv));
#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
#elif defined(BT_USE_NEON)
#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
#elif defined(BT_USE_NEON)
#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
#elif defined(BT_USE_NEON)
	return btQuaternion((btSimdFloat4)veorq_s32((int32x4_t)mVec128, (int32x4_t)btvMzeroMask));
	if( diff.dot(diff) > sum.dot(sum) )
	if( diff.dot(diff) < sum.dot(sum) )
#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
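	// SSE path: quaternion product q1 * q2 from three shuffled partial
	// products plus q1.w * q2, with the w-lane sign fixed via vPPPM.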
	__m128 vQ1 = q1.get128();
	__m128 vQ2 = q2.get128();
	__m128 A0, A1, B1, A2, B2;
	A1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(0,1,2,0));
	B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(3,3,3,0));
	A2 = bt_pshufd_ps(vQ1, BT_SHUFFLE(1,2,0,1));
	B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2,0,1,1));
	B1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(2,0,1,2));
	B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1,2,0,2));
	A0 = bt_splat_ps(vQ1, 3);
	A1 = _mm_xor_ps(A1, vPPPM);
#elif defined(BT_USE_NEON)
	float32x4_t vQ1 = q1.get128();
	float32x4_t vQ2 = q2.get128();
	float32x4_t A0, A1, B1, A2, B2, A3, B3;
	float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz;
	tmp = vtrn_f32( vget_high_f32(vQ1), vget_low_f32(vQ1) );
	tmp = vtrn_f32( vget_high_f32(vQ2), vget_low_f32(vQ2) );
	vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1);
	vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
	vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
	vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);
	A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);
	B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx);
	A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
	B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));
	A3 = vcombine_f32(vQ1zx, vQ1yz);
	B3 = vcombine_f32(vQ2yz, vQ2xz);
	A1 = vmulq_f32(A1, B1);
	A2 = vmulq_f32(A2, B2);
	A3 = vmulq_f32(A3, B3);
	A0 = vmulq_lane_f32(vQ2, vget_high_f32(vQ1), 1);
	A1 = vaddq_f32(A1, A2);
	A0 = vsubq_f32(A0, A3);
	A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM);
	A0 = vaddq_f32(A0, A1);
		q1.w() * q2.x() + q1.x() * q2.w() + q1.y() * q2.z() - q1.z() * q2.y(),
		q1.w() * q2.y() + q1.y() * q2.w() + q1.z() * q2.x() - q1.x() * q2.z(),
		q1.w() * q2.z() + q1.z() * q2.w() + q1.x() * q2.y() - q1.y() * q2.x(),
		q1.w() * q2.w() - q1.x() * q2.x() - q1.y() * q2.y() - q1.z() * q2.z());
#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
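	// SSE path: product of the quaternion q with the vector w, with w treated
	// as a pure quaternion (zero scalar part).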
	__m128 vQ1 = q.get128();
	__m128 vQ2 = w.get128();
	__m128 A1, B1, A2, B2, A3, B3;
	A1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(3,3,3,0));
	B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(0,1,2,0));
	A2 = bt_pshufd_ps(vQ1, BT_SHUFFLE(1,2,0,1));
	B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2,0,1,1));
	A3 = bt_pshufd_ps(vQ1, BT_SHUFFLE(2,0,1,2));
	B3 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1,2,0,2));
	A1 = _mm_xor_ps(A1, vPPPM);
#elif defined(BT_USE_NEON)
	float32x4_t vQ1 = q.get128();
	float32x4_t vQ2 = w.get128();
	float32x4_t A1, B1, A2, B2, A3, B3;
	float32x2_t vQ1wx, vQ2zx, vQ1yz, vQ2yz, vQ1zx, vQ2xz;
	vQ1wx = vext_f32(vget_high_f32(vQ1), vget_low_f32(vQ1), 1);
	tmp = vtrn_f32( vget_high_f32(vQ2), vget_low_f32(vQ2) );
	tmp = vtrn_f32( vget_high_f32(vQ1), vget_low_f32(vQ1) );
	vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
	vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
	vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);
	A1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ1), 1), vQ1wx);
	B1 = vcombine_f32(vget_low_f32(vQ2), vQ2zx);
	A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
	B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));
	A3 = vcombine_f32(vQ1zx, vQ1yz);
	B3 = vcombine_f32(vQ2yz, vQ2xz);
	A1 = vmulq_f32(A1, B1);
	A2 = vmulq_f32(A2, B2);
	A3 = vmulq_f32(A3, B3);
	A1 = vaddq_f32(A1, A2);
	A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM);
	A1 = vsubq_f32(A1, A3);
		q.w() * w.x() + q.y() * w.z() - q.z() * w.y(),
		q.w() * w.y() + q.z() * w.x() - q.x() * w.z(),
		q.w() * w.z() + q.x() * w.y() - q.y() * w.x(),
		-q.x() * w.x() - q.y() * w.y() - q.z() * w.z());
#if defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
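	// SSE path: product of the vector w (as a pure quaternion) with the
	// quaternion q; mirrors the q * w overload with the operands swapped.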
	__m128 vQ1 = w.get128();
	__m128 vQ2 = q.get128();
	__m128 A1, B1, A2, B2, A3, B3;
	A1 = bt_pshufd_ps(vQ1, BT_SHUFFLE(0,1,2,0));
	B1 = bt_pshufd_ps(vQ2, BT_SHUFFLE(3,3,3,0));
	A2 = bt_pshufd_ps(vQ1, BT_SHUFFLE(1,2,0,1));
	B2 = bt_pshufd_ps(vQ2, BT_SHUFFLE(2,0,1,1));
	A3 = bt_pshufd_ps(vQ1, BT_SHUFFLE(2,0,1,2));
	B3 = bt_pshufd_ps(vQ2, BT_SHUFFLE(1,2,0,2));
	A1 = _mm_xor_ps(A1, vPPPM);
#elif defined(BT_USE_NEON)
	float32x4_t vQ1 = w.get128();
	float32x4_t vQ2 = q.get128();
	float32x4_t A1, B1, A2, B2, A3, B3;
	float32x2_t vQ1zx, vQ2wx, vQ1yz, vQ2zx, vQ2yz, vQ2xz;
	tmp = vtrn_f32( vget_high_f32(vQ1), vget_low_f32(vQ1) );
	tmp = vtrn_f32( vget_high_f32(vQ2), vget_low_f32(vQ2) );
	vQ2wx = vext_f32(vget_high_f32(vQ2), vget_low_f32(vQ2), 1);
	vQ1yz = vext_f32(vget_low_f32(vQ1), vget_high_f32(vQ1), 1);
	vQ2yz = vext_f32(vget_low_f32(vQ2), vget_high_f32(vQ2), 1);
	vQ2xz = vext_f32(vQ2zx, vQ2zx, 1);
	A1 = vcombine_f32(vget_low_f32(vQ1), vQ1zx);
	B1 = vcombine_f32(vdup_lane_f32(vget_high_f32(vQ2), 1), vQ2wx);
	A2 = vcombine_f32(vQ1yz, vget_low_f32(vQ1));
	B2 = vcombine_f32(vQ2zx, vdup_lane_f32(vget_low_f32(vQ2), 1));
	A3 = vcombine_f32(vQ1zx, vQ1yz);
	B3 = vcombine_f32(vQ2yz, vQ2xz);
	A1 = vmulq_f32(A1, B1);
	A2 = vmulq_f32(A2, B2);
	A3 = vmulq_f32(A3, B3);
	A1 = vaddq_f32(A1, A2);
	A1 = (btSimdFloat4)veorq_s32((int32x4_t)A1, (int32x4_t)vPPPM);
	A1 = vsubq_f32(A1, A3);
		+w.x() * q.w() + w.y() * q.z() - w.z() * q.y(),
		+w.y() * q.w() + w.z() * q.x() - w.x() * q.z(),
		+w.z() * q.w() + w.x() * q.y() - w.y() * q.x(),
		-w.x() * q.x() - w.y() * q.y() - w.z() * q.z());
	return q1.slerp(q2, t);
#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
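	// Clear the w lane so the result comes back as a pure btVector3.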
	return btVector3(_mm_and_ps(q.get128(), btvFFF0fMask));
#elif defined(BT_USE_NEON)
	return btVector3((float32x4_t)vandq_s32((int32x4_t)q.get128(), btvFFF0Mask));
#endif //BT_SIMD__QUATERNION_H_