Added contributed cl-simd module by Alexander Gavrilov
This commit is contained in:
parent
0b0c437281
commit
29f46d0387
14 changed files with 3421 additions and 0 deletions
25
contrib/cl-simd/LICENSE
Normal file
25
contrib/cl-simd/LICENSE
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
(This is the MIT / X Consortium license as taken from
|
||||
http://www.opensource.org/licenses/mit-license.html on or about
|
||||
Monday; July 13, 2009)
|
||||
|
||||
Copyright (c) 2010 by Alexander Gavrilov
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
||||
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
||||
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
||||
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
8
contrib/cl-simd/README
Normal file
8
contrib/cl-simd/README
Normal file
|
|
@ -0,0 +1,8 @@
|
|||
This module implements SSE intrinsic functions for ECL and SBCL.
|
||||
|
||||
NOTE: CURRENTLY THIS SHOULD BE CONSIDERED EXPERIMENTAL, AND
|
||||
SUBJECT TO INCOMPATIBLE CHANGES IN A FUTURE RELEASE.
|
||||
|
||||
Since the implementation is closely tied to the internals of
|
||||
the compiler, it should normally be obtained exclusively via
|
||||
the bundled contrib mechanism of the above implementations.
|
||||
44
contrib/cl-simd/cl-simd.asd
Normal file
44
contrib/cl-simd/cl-simd.asd
Normal file
|
|
@ -0,0 +1,44 @@
|
|||
;;; -*- mode: Lisp; indent-tabs-mode: nil; -*-
|
||||
;;;
|
||||
;;; Copyright (C) 2010, Alexander Gavrilov (angavrilov@gmail.com)
|
||||
;;;
|
||||
;;; This file defines the cl-simd ASDF system.
|
||||
;;;
|
||||
;;; Note that a completely independent definition
|
||||
;;; is used to build the system as an ECL contrib.
|
||||
|
||||
(defsystem :cl-simd
|
||||
:version "1.0"
|
||||
#+sb-building-contrib :pathname
|
||||
#+sb-building-contrib #p"SYS:CONTRIB;CL-SIMD;"
|
||||
:components
|
||||
#+(and sbcl sb-sse-intrinsics)
|
||||
((:file "sse-package")
|
||||
(:file "sbcl-core" :depends-on ("sse-package"))
|
||||
(:file "sse-intrinsics" :depends-on ("sbcl-core"))
|
||||
(:file "sbcl-functions" :depends-on ("sse-intrinsics"))
|
||||
(:file "sbcl-arrays" :depends-on ("sbcl-functions"))
|
||||
(:file "sse-array-defs" :depends-on ("sbcl-arrays"))
|
||||
(:file "sse-utils" :depends-on ("sse-array-defs")))
|
||||
#+(and ecl sse2)
|
||||
((:file "sse-package")
|
||||
(:file "ecl-sse-core" :depends-on ("sse-package"))
|
||||
(:file "sse-intrinsics" :depends-on ("ecl-sse-core"))
|
||||
(:file "sse-array-defs" :depends-on ("sse-intrinsics"))
|
||||
(:file "ecl-sse-utils" :depends-on ("sse-intrinsics"))
|
||||
(:file "sse-utils" :depends-on ("ecl-sse-utils")))
|
||||
#-(or (and sbcl sb-sse-intrinsics)
|
||||
(and ecl sse2))
|
||||
())
|
||||
|
||||
#+(or (and sbcl sb-sse-intrinsics)
|
||||
(and ecl sse2))
|
||||
(defmethod perform :after ((o load-op) (c (eql (find-system :cl-simd))))
|
||||
(provide :cl-simd))
|
||||
|
||||
(defmethod perform ((o test-op) (c (eql (find-system :cl-simd))))
|
||||
#+(or (and sbcl sb-sse-intrinsics)
|
||||
(and ecl sse2))
|
||||
(or (load (compile-file "test-sfmt.lisp"))
|
||||
(error "test-sfmt failed")))
|
||||
|
||||
250
contrib/cl-simd/cl-simd.texinfo
Normal file
250
contrib/cl-simd/cl-simd.texinfo
Normal file
|
|
@ -0,0 +1,250 @@
|
|||
@node cl-simd
|
||||
@section cl-simd
|
||||
@cindex SSE2 Intrinsics
|
||||
@cindex Intrinsics, SSE2
|
||||
|
||||
The @code{cl-simd} module provides access to SSE2 instructions
|
||||
(which are nowadays supported by any CPU compatible with x86-64)
|
||||
in the form of @emph{intrinsic functions}, similar to the way
|
||||
adopted by modern C compilers. It also provides some lisp-specific
|
||||
functionality, like setf-able intrinsics for accessing lisp arrays.
|
||||
|
||||
When this module is loaded, it defines an @code{:sse2} feature,
|
||||
which can be subsequently used for conditional compilation of
|
||||
code that depends on it. Intrinsic functions are available from
|
||||
the @code{sse} package.
|
||||
|
||||
This API, with minor technical differences, is supported by both
|
||||
ECL and SBCL (x86-64 only).
|
||||
|
||||
@menu
|
||||
* SSE pack types::
|
||||
* SSE array type::
|
||||
* Differences from C intrinsics::
|
||||
* Simple extensions::
|
||||
* Lisp array accessors::
|
||||
* Example::
|
||||
@end menu
|
||||
|
||||
@node SSE pack types
|
||||
@subsection SSE pack types
|
||||
|
||||
The package defines and/or exports the following types to
|
||||
represent 128-bit SSE register contents:
|
||||
|
||||
@anchor{Type sse:sse-pack}
|
||||
@deftp {Type} @somepkg{sse-pack,sse} @&optional item-type
|
||||
The generic SSE pack type.
|
||||
@end deftp
|
||||
|
||||
@anchor{Type sse:int-sse-pack}
|
||||
@deftp {Type} @somepkg{int-sse-pack,sse}
|
||||
Same as @code{(sse-pack integer)}.
|
||||
@end deftp
|
||||
|
||||
@anchor{Type sse:float-sse-pack}
|
||||
@deftp {Type} @somepkg{float-sse-pack,sse}
|
||||
Same as @code{(sse-pack single-float)}.
|
||||
@end deftp
|
||||
|
||||
@anchor{Type sse:double-sse-pack}
|
||||
@deftp {Type} @somepkg{double-sse-pack,sse}
|
||||
Same as @code{(sse-pack double-float)}.
|
||||
@end deftp
|
||||
|
||||
Declaring variable types using the subtype appropriate
|
||||
for your data is likely to lead to more efficient code
|
||||
(especially on ECL). However, the compiler implicitly
|
||||
casts between any subtypes of sse-pack when needed.
|
||||
|
||||
Printed representation of SSE packs can be controlled
|
||||
by binding @code{*sse-pack-print-mode*}:
|
||||
|
||||
@anchor{Variable sse:*sse-pack-print-mode*}
|
||||
@defvr {Variable} @somepkg{@earmuffs{sse-pack-print-mode},sse}
|
||||
When set to one of @code{:int}, @code{:float} or
|
||||
@code{:double}, specifies the way SSE packs are
|
||||
printed. A @code{NIL} value (default) instructs
|
||||
the implementation to make its best effort to
|
||||
guess from the data and context.
|
||||
@end defvr
|
||||
|
||||
@node SSE array type
|
||||
@subsection SSE array type
|
||||
|
||||
@anchor{Type sse:sse-array}
|
||||
@deftp {Type} @somepkg{sse-array,sse} element-type @&optional dimensions
|
||||
Expands to a lisp array type that is efficiently
|
||||
supported by AREF-like accessors.
|
||||
It should be assumed to be a subtype of @code{SIMPLE-ARRAY}.
|
||||
The type expander signals warnings or errors if it detects
|
||||
that the element-type argument value is inappropriate or unsafe.
|
||||
@end deftp
|
||||
|
||||
@anchor{Function sse:make-sse-array}
|
||||
@deffn {Function} @somepkg{make-sse-array,sse} dimensions @&key element-type initial-element displaced-to displaced-index-offset
|
||||
Creates an object of type @code{sse-array}, or signals an error.
|
||||
In non-displaced case ensures alignment of the beginning of data to
|
||||
the 16-byte boundary.
|
||||
Unlike @code{make-array}, the element type defaults to (unsigned-byte 8).
|
||||
@end deffn
|
||||
|
||||
On ECL this function supports full-featured displacement.
|
||||
On SBCL it has to simulate it by sharing the underlying
|
||||
data vector, and does not support nonzero index offset.
|
||||
|
||||
@node Differences from C intrinsics
|
||||
@subsection Differences from C intrinsics
|
||||
|
||||
Intel Compiler, GCC and
|
||||
@url{http://msdn.microsoft.com/en-us/library/y0dh78ez%28VS.80%29.aspx,MSVC}
|
||||
all support the same set
|
||||
of SSE intrinsics, originally designed by Intel. This
|
||||
package generally follows the naming scheme of the C
|
||||
version, with the following exceptions:
|
||||
|
||||
@itemize
|
||||
@item
|
||||
Underscores are replaced with dashes, and the @code{_mm_}
|
||||
prefix is removed in favor of packages.
|
||||
|
||||
@item
|
||||
The 'e' from @code{epi} is dropped because MMX is obsolete
|
||||
and won't be supported.
|
||||
|
||||
@item
|
||||
@code{_si128} functions are renamed to @code{-pi} for uniformity
|
||||
and brevity. The author has personally found this discrepancy
|
||||
in the original C intrinsics naming highly jarring.
|
||||
|
||||
@item
|
||||
Comparisons are named using graphic characters, e.g. @code{<=-ps}
|
||||
for @code{cmpleps}, or @code{/>-ps} for @code{cmpngtps}. In some
|
||||
places the set of comparison functions is extended to cover the
|
||||
full possible range.
|
||||
|
||||
@item
|
||||
Conversion functions are renamed to @code{convert-*-to-*} and
|
||||
@code{truncate-*-to-*}.
|
||||
|
||||
@item
|
||||
A few functions are completely renamed: @code{cpu-mxcsr} (setf-able),
|
||||
@code{cpu-pause}, @code{cpu-load-fence}, @code{cpu-store-fence},
|
||||
@code{cpu-memory-fence}, @code{cpu-clflush}, @code{cpu-prefetch-*}.
|
||||
@end itemize
|
||||
|
||||
In addition, foreign pointer access intrinsics have an additional
|
||||
optional integer offset parameter to allow more efficient coding
|
||||
of pointer deference, and the most common ones have been renamed
|
||||
and made SETF-able:
|
||||
|
||||
@itemize
|
||||
@item
|
||||
@code{mem-ref-ss}, @code{mem-ref-ps}, @code{mem-ref-aps}
|
||||
|
||||
@item
|
||||
@code{mem-ref-sd}, @code{mem-ref-pd}, @code{mem-ref-apd}
|
||||
|
||||
@item
|
||||
@code{mem-ref-pi}, @code{mem-ref-api}, @code{mem-ref-si64}
|
||||
@end itemize
|
||||
|
||||
(The @code{-ap*} version requires alignment.)
|
||||
|
||||
@node Simple extensions
|
||||
@subsection Simple extensions
|
||||
|
||||
This module extends the set of basic intrinsics with the following
|
||||
simple compound functions:
|
||||
|
||||
@itemize
|
||||
@item
|
||||
@code{neg-ss}, @code{neg-ps}, @code{neg-sd}, @code{neg-pd},
|
||||
@code{neg-pi8}, @code{neg-pi16}, @code{neg-pi32}, @code{neg-pi64}:
|
||||
|
||||
implement numeric negation of the corresponding data type.
|
||||
|
||||
@item
|
||||
@code{not-ps}, @code{not-pd}, @code{not-pi}:
|
||||
|
||||
implement bitwise logical inversion.
|
||||
|
||||
@item
|
||||
@code{if-ps}, @code{if-pd}, @code{if-pi}:
|
||||
|
||||
perform element-wise combining of two values based on a boolean
|
||||
condition vector produced as a combination of comparison function
|
||||
results through bitwise logical functions.
|
||||
|
||||
The condition value must use all-zero bitmask for false, and
|
||||
all-one bitmask for true as a value for each logical vector
|
||||
element. The result is undefined if any other bit pattern is used.
|
||||
|
||||
N.B.: these are @emph{functions}, so both branches of the
|
||||
conditional are always evaluated.
|
||||
@end itemize
|
||||
|
||||
The module also provides symbol macros that expand into expressions
|
||||
producing certain constants in the most efficient way:
|
||||
|
||||
@itemize
|
||||
@item
|
||||
0.0-ps 0.0-pd 0-pi for zero
|
||||
|
||||
@item
|
||||
true-ps true-pd true-pi for all 1 bitmask
|
||||
|
||||
@item
|
||||
false-ps false-pd false-pi for all 0 bitmask (same as zero)
|
||||
@end itemize
|
||||
|
||||
@node Lisp array accessors
|
||||
@subsection Lisp array accessors
|
||||
|
||||
In order to provide better integration with ordinary lisp code,
|
||||
this module implements a set of AREF-like memory accessors:
|
||||
|
||||
@itemize
|
||||
@item
|
||||
@code{(ROW-MAJOR-)?AREF-PREFETCH-(T0|T1|T2|NTA)} for cache prefetch.
|
||||
|
||||
@item
|
||||
@code{(ROW-MAJOR-)?AREF-CLFLUSH} for cache flush.
|
||||
|
||||
@item
|
||||
@code{(ROW-MAJOR-)?AREF-[AS]?P[SDI]} for whole-pack read & write.
|
||||
@end itemize
|
||||
|
||||
(Where A = aligned; S = aligned streamed write.)
|
||||
|
||||
These accessors can be used with any non-bit specialized
|
||||
array or vector, without restriction on the precise element
|
||||
type (although it should be declared at compile time to
|
||||
ensure generation of the fastest code).
|
||||
|
||||
Additional index bound checking is done to ensure that 16
|
||||
bytes of memory are accessible after the specified index.
|
||||
|
||||
As an exception, ROW-MAJOR-AREF-PREFETCH-* does not do any
|
||||
range checks at all, because the prefetch instructions
|
||||
are officially safe to use with bad addresses. The
|
||||
AREF-PREFETCH-* and *-CLFLUSH functions do only ordinary
|
||||
index checks without the 16-byte extension.
|
||||
|
||||
@node Example
|
||||
@subsection Example
|
||||
|
||||
This code processes several single-float arrays, storing
|
||||
either the value of a*b, or c/3.5 into result, depending
|
||||
on the sign of mode:
|
||||
|
||||
@example
|
||||
(loop for i from 0 below 128 by 4
|
||||
do (setf (aref-ps result i)
|
||||
(if-ps (<-ps (aref-ps mode i) 0.0-ps)
|
||||
(mul-ps (aref-ps a i) (aref-ps b i))
|
||||
(div-ps (aref-ps c i) (set1-ps 3.5)))))
|
||||
@end example
|
||||
|
||||
As already noted above, both branches of the if are always
|
||||
evaluated.
|
||||
310
contrib/cl-simd/ecl-sse-core.lisp
Normal file
310
contrib/cl-simd/ecl-sse-core.lisp
Normal file
|
|
@ -0,0 +1,310 @@
|
|||
;;; -*- mode: Lisp; indent-tabs-mode: nil; -*-
|
||||
;;;
|
||||
;;; Copyright (c) 2010, Alexander Gavrilov (angavrilov@gmail.com)
|
||||
;;;
|
||||
;;; This file defines macros for wrapping C-level SSE intrinsics.
|
||||
;;;
|
||||
|
||||
(in-package #:SSE)
|
||||
|
||||
;;; The compound SSE pack type
|
||||
|
||||
(deftype sse-pack (&optional item)
|
||||
(ecase item
|
||||
(* 'ext:sse-pack)
|
||||
((single-float float) 'float-sse-pack)
|
||||
(double-float 'double-sse-pack)
|
||||
(integer 'int-sse-pack)))
|
||||
|
||||
;;; Helper macros and functions
|
||||
|
||||
(defmacro typename-case (value &body clauses)
|
||||
"Syntax: (case value &body clauses)"
|
||||
`(cond ,@(mapcar (lambda (clause)
|
||||
`((subtypep ,value ',(first clause))
|
||||
,@(rest clause)))
|
||||
clauses)
|
||||
(t (error "Unsupported type name: ~S" ,value))))
|
||||
|
||||
(defun foreign-type-of (lt)
|
||||
(typename-case lt
|
||||
(nil :object)
|
||||
(int-sse-pack :int-sse-pack)
|
||||
(float-sse-pack :float-sse-pack)
|
||||
(double-sse-pack :double-sse-pack)
|
||||
(single-float :float)
|
||||
(double-float :double)
|
||||
(fixnum :fixnum)
|
||||
#+uint32-t
|
||||
(ext:integer32 :int32-t)
|
||||
#+uint32-t
|
||||
(ext:byte32 :uint32-t)
|
||||
#+uint64-t
|
||||
(ext:integer64 :int64-t)
|
||||
#+uint64-t
|
||||
(ext:byte64 :uint64-t)
|
||||
(integer :fixnum)))
|
||||
|
||||
(defun pointer-c-type-of (lt)
|
||||
(typename-case lt
|
||||
(nil "void")
|
||||
(int-sse-pack "__m128i")
|
||||
(float-sse-pack "float")
|
||||
(double-sse-pack "double")))
|
||||
|
||||
;; Accept any real values for floating-point arguments:
|
||||
(defun declaim-arg-type-of (lt)
|
||||
(typename-case lt
|
||||
((or single-float double-float) 'real)
|
||||
(ext:sse-pack 'ext:sse-pack)
|
||||
(fixnum 'fixnum)
|
||||
(t lt)))
|
||||
|
||||
(defun inline-arg-type-of (lt)
|
||||
(typename-case lt
|
||||
((or single-float double-float) 'c::fixnum-float)
|
||||
(fixnum 'fixnum)
|
||||
(t lt)))
|
||||
|
||||
;; Constant expansion
|
||||
(defun expand-constant (form env &optional chgp)
|
||||
(let* ((mform (macroexpand form env))
|
||||
(cform (cond ((and (symbolp mform) (constantp mform))
|
||||
(symbol-value mform))
|
||||
(t mform))))
|
||||
(values cform (or chgp (not (eql cform form))))))
|
||||
|
||||
;; Macro helpers
|
||||
(defun make-arg-name (index)
|
||||
(intern (format nil "ARG~A" index)))
|
||||
|
||||
(defun make-arg-nums (lst)
|
||||
(loop for i from 0 below (length lst) collect i))
|
||||
|
||||
(defun wrap-ret-arg (core ret-type &optional ret-arg)
|
||||
(cond ((eq ret-type nil)
|
||||
(format nil "(~A,Cnil)" core))
|
||||
(ret-arg
|
||||
(format nil "@~36R;(~A,#~36R)" ret-arg core ret-arg))
|
||||
(t core)))
|
||||
|
||||
;; Constant generation
|
||||
(defun make-pack-of-bin (bin-value &key (as 'int-sse-pack))
|
||||
(let* ((all (loop for i from 0 to 15
|
||||
for v = bin-value then (ash v -8)
|
||||
collect (logand v 255)))
|
||||
(pack (ext:vector-to-sse-pack
|
||||
(make-array 16 :element-type '(unsigned-byte 8) :initial-contents all))))
|
||||
(if (eq as 'int-sse-pack)
|
||||
pack
|
||||
`(the ,as ,(ext:sse-pack-as-elt-type
|
||||
pack (ecase as
|
||||
(EXT:FLOAT-SSE-PACK 'single-float)
|
||||
(EXT:DOUBLE-SSE-PACK 'double-float)))))))
|
||||
|
||||
(defmacro def-inline (name mode arg-types ret-type call-str &rest flags)
|
||||
`(eval-when (:compile-toplevel :load-toplevel)
|
||||
(c::def-inline ',name ',mode ',arg-types ',ret-type ,call-str ,@flags)))
|
||||
|
||||
(defmacro def-intrinsic (name arg-types ret-type c-name
|
||||
&key (export t) ret-arg reorder-args immediate-args)
|
||||
"Defines and exports an SSE intrinsic function with matching open-coding rules."
|
||||
(let* ((anums (make-arg-nums arg-types))
|
||||
(asyms (mapcar #'make-arg-name anums))
|
||||
(aftypes (mapcar #'foreign-type-of arg-types))
|
||||
(rftype (foreign-type-of ret-type))
|
||||
(call-anums (if reorder-args (reverse anums) anums))
|
||||
(call-str (wrap-ret-arg (format nil "~A(~{#~36R~^,~})" c-name call-anums) ret-type ret-arg)))
|
||||
`(progn
|
||||
,(if export `(export ',name))
|
||||
,@(if immediate-args ; Generate a constantness verifier macro
|
||||
`((define-compiler-macro ,name (&whole whole &environment env ,@asyms &aux chgp)
|
||||
,@(loop for (arg type) in immediate-args
|
||||
collect `(let ((oldv ,arg))
|
||||
(multiple-value-setq (,arg chgp) (expand-constant oldv env chgp))
|
||||
(unless (typep ,arg ',type)
|
||||
(c::cmperr "In call to ~A: Argument ~S~@[ = ~S~] is not a constant of type ~A"
|
||||
',name oldv (unless (eql oldv ,arg) ,arg) ',type))))
|
||||
(if chgp (list ',name ,@asyms) whole))))
|
||||
(proclaim '(ftype (function ,(mapcar #'declaim-arg-type-of arg-types) ,(or ret-type 'null)) ,name))
|
||||
,@(if (null immediate-args)
|
||||
`((defun ,name ,asyms
|
||||
(declare (optimize (speed 0) (debug 0) (safety 1)))
|
||||
(ffi:c-inline ,asyms ,aftypes ,rftype ,call-str :one-liner t))))
|
||||
(def-inline ,name :always ,(mapcar #'inline-arg-type-of arg-types) ,rftype
|
||||
,call-str :inline-or-warn t))))
|
||||
|
||||
(defmacro def-unary-intrinsic (name ret-type insn cost c-name
|
||||
&key (arg-type ret-type) partial result-size immediate-arg)
|
||||
(declare (ignore insn cost partial result-size))
|
||||
`(def-intrinsic ,name (,arg-type ,@(if immediate-arg (list immediate-arg)))
|
||||
,ret-type ,c-name :immediate-args ,(if immediate-arg `((arg1 ,immediate-arg)))))
|
||||
|
||||
(defmacro def-cvt-to-int32-intrinsic (name ret-type insn cost c-name
|
||||
&key (arg-type ret-type) partial immediate-arg)
|
||||
(declare (ignore insn cost partial))
|
||||
(assert (subtypep ret-type '(signed-byte 32)))
|
||||
`(def-intrinsic ,name (,arg-type ,@(if immediate-arg (list immediate-arg)))
|
||||
,ret-type ,c-name :immediate-args ,(if immediate-arg `((arg1 ,immediate-arg)))))
|
||||
|
||||
(defmacro def-binary-intrinsic (name ret-type insn cost c-name
|
||||
&key (x-type ret-type) (y-type ret-type)
|
||||
commutative tags immediate-arg)
|
||||
(declare (ignore insn cost commutative tags))
|
||||
`(def-intrinsic ,name (,x-type ,y-type ,@(if immediate-arg (list immediate-arg)))
|
||||
,ret-type ,c-name :immediate-args ,(if immediate-arg `((arg2 ,immediate-arg)))))
|
||||
|
||||
(defmacro def-sse-int-intrinsic (name int-type ret-type insn cost c-name
|
||||
&key (arg-type ret-type) immediate-arg make-temporary)
|
||||
(declare (ignore insn cost make-temporary))
|
||||
`(def-intrinsic ,name (,arg-type ,int-type ,@(if immediate-arg (list immediate-arg)))
|
||||
,ret-type ,c-name :immediate-args ,(if immediate-arg `((arg2 ,immediate-arg)))))
|
||||
|
||||
(defmacro %def-aref-intrinsic (tag val-type c-type reader writer &key (aux-args "") (bsize 16))
|
||||
"Defines and exports macros and functios that implement vectorized array access."
|
||||
(let* ((rftype (foreign-type-of val-type))
|
||||
(aref-name (intern (format nil "AREF-~A" tag) *package*))
|
||||
(rm-aref-name (intern (format nil "ROW-MAJOR-AREF-~A" tag) *package*))
|
||||
(rm-aset-name (intern (format nil "ROW-MAJOR-ASET-~A" tag) *package*))
|
||||
(known-elt-types '((single-float "sf")
|
||||
(double-float "df")
|
||||
(ext:byte8 "b8")
|
||||
(ext:integer8 "i8")
|
||||
#+uint16-t (ext:byte16 "b16")
|
||||
#+uint16-t (ext:integer16 "i16")
|
||||
#+uint32-t (ext:byte32 "b32")
|
||||
#+uint32-t (ext:integer32 "i32")
|
||||
#+uint64-t (ext:byte64 "b64")
|
||||
#+uint64-t (ext:integer64 "i64"))))
|
||||
(flet ((fmtr (ptr-fmt &rest ptr-args)
|
||||
(wrap-ret-arg (format nil "~A((~A*)~?~A)"
|
||||
reader c-type ptr-fmt ptr-args aux-args)
|
||||
val-type))
|
||||
(fmtw (ptr-fmt &rest ptr-args)
|
||||
(wrap-ret-arg (format nil "~A((~A*)~?,#2)"
|
||||
writer c-type ptr-fmt ptr-args)
|
||||
val-type 2)))
|
||||
`(progn
|
||||
(export ',aref-name)
|
||||
(export ',rm-aref-name)
|
||||
(defmacro ,aref-name (array &rest indexes)
|
||||
(let ((varr (gensym "ARR")))
|
||||
`(let ((,varr ,array))
|
||||
(declare (:read-only ,varr))
|
||||
(,',rm-aref-name ,varr (array-row-major-index ,varr ,@indexes)))))
|
||||
(proclaim '(ftype (function (array fixnum) ,(or val-type 'null)) ,rm-aref-name))
|
||||
(defun ,rm-aref-name (array index)
|
||||
(declare (optimize (speed 0) (debug 0) (safety 2)))
|
||||
(ffi:c-inline (array index) (:object :int) ,rftype
|
||||
,(fmtr "ecl_row_major_ptr(#0,#1,~A)" bsize)
|
||||
:one-liner t))
|
||||
;; AREF
|
||||
(def-inline ,rm-aref-name :always (t t) ,rftype
|
||||
,(fmtr "ecl_row_major_ptr(#0,fixint(#1),~A)" bsize)
|
||||
:inline-or-warn t)
|
||||
(def-inline ,rm-aref-name :always (t fixnum) ,rftype
|
||||
,(fmtr "ecl_row_major_ptr(#0,#1,~A)" bsize))
|
||||
;; AREF unsafe
|
||||
,@(mapcar (lambda (spec)
|
||||
`(def-inline ,rm-aref-name :unsafe ((array ,(first spec)) fixnum) ,rftype
|
||||
,(fmtr "(&(#0)->array.self.~A[#1])" (second spec))))
|
||||
known-elt-types)
|
||||
,@(if writer
|
||||
`((define-setf-expander ,aref-name (array &rest indexes)
|
||||
(let ((varr (gensym)) (vidx (gensym)) (vval (gensym)))
|
||||
(values (list varr vidx)
|
||||
(list array `(array-row-major-index ,varr ,@indexes))
|
||||
(list vval)
|
||||
`(,',rm-aset-name ,varr ,vidx ,vval) `(,',rm-aref-name ,varr ,vidx))))
|
||||
(proclaim '(ftype (function (array fixnum ,(declaim-arg-type-of val-type)) ,val-type) ,rm-aset-name))
|
||||
(defun ,rm-aset-name (array index value)
|
||||
(declare (optimize (speed 0) (debug 0) (safety 2)))
|
||||
(prog1 value
|
||||
(ffi:c-inline (array index value) (:object :int ,rftype) :void
|
||||
,(fmtw "ecl_row_major_ptr(#0,#1,~A)" bsize)
|
||||
:one-liner t)))
|
||||
(defsetf ,rm-aref-name ,rm-aset-name)
|
||||
;; ASET
|
||||
(def-inline ,rm-aset-name :always (t t ,val-type) ,rftype
|
||||
,(fmtw "ecl_row_major_ptr(#0,fixint(#1),~A)" bsize)
|
||||
:inline-or-warn t)
|
||||
(def-inline ,rm-aset-name :always (t fixnum ,val-type) ,rftype
|
||||
,(fmtw "ecl_row_major_ptr(#0,#1,~A)" bsize))
|
||||
;; ASET unsafe
|
||||
,@(mapcar (lambda (spec)
|
||||
`(def-inline ,rm-aset-name :unsafe ((array ,(first spec)) fixnum ,val-type) ,rftype
|
||||
,(fmtw "(&(#0)->array.self.~A[#1])" (second spec))))
|
||||
known-elt-types)))))))
|
||||
|
||||
(defmacro def-aref-intrinsic (tag val-type reader-fun writer-fun &key (check-bounds t))
|
||||
`(%def-aref-intrinsic ,tag ,val-type ,(pointer-c-type-of val-type)
|
||||
,(get reader-fun 'c-function-name) ,(get writer-fun 'c-function-name)
|
||||
:bsize ,(ecase check-bounds
|
||||
(t 16) ((nil) 0) (:no-gap 1))
|
||||
:aux-args ,(get reader-fun 'c-call-aux-args)))
|
||||
|
||||
(defmacro def-mem-intrinsic (name c-type ret-type c-name &key (public t)
|
||||
prefix-args (prefix-fmt "~@{#~36R,~}")
|
||||
postfix-args (postfix-fmt "~@{,#~36R~}" pf-p) ret-arg)
|
||||
"Defines and exports an SSE memory access intrinsic function with matching open-coding rules."
|
||||
(let* ((anums (make-arg-nums (append prefix-args postfix-args)))
|
||||
(asyms (mapcar #'make-arg-name anums))
|
||||
(prefix-nums (subseq anums 0 (length prefix-args)))
|
||||
(postfix-nums (mapcar #'1+ (subseq anums (length prefix-args))))
|
||||
(prefix-syms (subseq asyms 0 (length prefix-args)))
|
||||
(postfix-syms (subseq asyms (length prefix-args)))
|
||||
(prefix-itypes (mapcar #'inline-arg-type-of prefix-args))
|
||||
(postfix-itypes (mapcar #'inline-arg-type-of postfix-args))
|
||||
(rftype (foreign-type-of ret-type))
|
||||
(ptr-idx (length prefix-args))
|
||||
(offset-idx (+ ptr-idx 1 (length postfix-args))))
|
||||
(flet ((fmt (ptr-text)
|
||||
(wrap-ret-arg (format nil "~A(~?(~A*)~?~?)"
|
||||
c-name prefix-fmt prefix-nums
|
||||
c-type ptr-text (list ptr-idx offset-idx)
|
||||
postfix-fmt postfix-nums)
|
||||
ret-type ret-arg)))
|
||||
`(progn
|
||||
,(when public `(export ',name))
|
||||
(eval-when (:compile-toplevel :load-toplevel :execute)
|
||||
(setf (get ',name 'c-function-name) ,c-name)
|
||||
,(if (and pf-p (null postfix-args))
|
||||
`(setf (get ',name 'c-call-aux-args) ,postfix-fmt)))
|
||||
(proclaim '(ftype (function (,@(mapcar #'declaim-arg-type-of prefix-args) si:foreign-data
|
||||
,@(mapcar #'declaim-arg-type-of postfix-args) &optional fixnum) ,ret-type) ,name))
|
||||
(defun ,name (,@prefix-syms ptr ,@postfix-syms &optional (offset 0))
|
||||
(declare (optimize (speed 0) (debug 0) (safety 1)))
|
||||
(ffi:c-inline (,@prefix-syms ptr ,@postfix-syms offset)
|
||||
(,@(mapcar #'foreign-type-of prefix-args) :pointer-void
|
||||
,@(mapcar #'foreign-type-of postfix-args) :int) ,rftype
|
||||
,(fmt "(((char*)#~A) + #~A)")
|
||||
:one-liner t))
|
||||
(def-inline ,name :always (,@prefix-itypes t ,@postfix-itypes) ,rftype
|
||||
,(fmt "ecl_to_pointer(#~A)")
|
||||
:inline-or-warn t)
|
||||
(def-inline ,name :always (,@prefix-itypes t ,@postfix-itypes t) ,rftype
|
||||
,(fmt "(((char*)ecl_to_pointer(#~A)) + fixint(#~A))"))
|
||||
(def-inline ,name :always (,@prefix-itypes t ,@postfix-itypes fixnum) ,rftype
|
||||
,(fmt "(((char*)ecl_to_pointer(#~A)) + #~A)"))
|
||||
(def-inline ,name :unsafe (,@prefix-itypes si:foreign-data ,@postfix-itypes) ,rftype
|
||||
,(fmt "(#~A)->foreign.data"))
|
||||
(def-inline ,name :unsafe (,@prefix-itypes si:foreign-data ,@postfix-itypes t) ,rftype
|
||||
,(fmt "(((char*)(#~A)->foreign.data) + fix(#~A))"))
|
||||
(def-inline ,name :unsafe (,@prefix-itypes si:foreign-data ,@postfix-itypes fixnum) ,rftype
|
||||
,(fmt "(((char*)(#~A)->foreign.data) + #~A)"))))))
|
||||
|
||||
(defmacro def-load-intrinsic (name ret-type insn c-name &key register-arg tags size postfix-fmt)
|
||||
(declare (ignore insn tags size))
|
||||
`(def-mem-intrinsic ,name ,(pointer-c-type-of ret-type) ,ret-type ,c-name
|
||||
:prefix-args ,(if register-arg (list ret-type))
|
||||
:postfix-fmt ,(or postfix-fmt "")))
|
||||
|
||||
(defmacro def-store-intrinsic (name ret-type insn c-name &key setf-name)
|
||||
(declare (ignore insn))
|
||||
`(progn
|
||||
(def-mem-intrinsic ,name ,(pointer-c-type-of ret-type) ,ret-type ,c-name
|
||||
:public ,(not setf-name) :postfix-args (,ret-type) :ret-arg 1)
|
||||
,(if setf-name
|
||||
`(defsetf ,setf-name (pointer &optional (offset 0)) (value)
|
||||
`(,',name ,pointer ,value ,offset)))))
|
||||
|
||||
398
contrib/cl-simd/ecl-sse-utils.lisp
Normal file
398
contrib/cl-simd/ecl-sse-utils.lisp
Normal file
|
|
@ -0,0 +1,398 @@
|
|||
;;; -*- mode: Lisp; indent-tabs-mode: nil; -*-
|
||||
;;;
|
||||
;;; Copyright (c) 2010, Alexander Gavrilov (angavrilov@gmail.com)
|
||||
;;;
|
||||
;;; This file defines some extensions to the base intrinsic set,
|
||||
;;; and other utility functions.
|
||||
;;;
|
||||
|
||||
(in-package #:SSE)
|
||||
|
||||
;;; Helper macros and functions
|
||||
(eval-when (:compile-toplevel :load-toplevel :execute)
|
||||
;; Try using a matching inverse function name
|
||||
(defun lookup-flip (arg pairs &key no-reverse)
|
||||
(and (consp arg)
|
||||
(let ((fix (or (cdr (assoc (first arg) pairs))
|
||||
(unless no-reverse
|
||||
(car (rassoc (first arg) pairs))))))
|
||||
(cond ((eq fix :identity)
|
||||
(assert (null (cddr arg)))
|
||||
(second arg))
|
||||
(fix
|
||||
`(,fix ,@(rest arg)))
|
||||
(t nil)))))
|
||||
;; Macroexpand, plus compiler expand some specific names
|
||||
(defun expand-condition (form env)
|
||||
(setq form (macroexpand form env))
|
||||
(loop while (and (consp form)
|
||||
(symbolp (first form))
|
||||
(get (first form) 'expand-in-condition))
|
||||
do (setq form (c::cmp-expand-macro (compiler-macro-function (first form))
|
||||
form env)))
|
||||
form)
|
||||
;; Checks if the form is an unary call
|
||||
(defun is-unary? (form op)
|
||||
(and (consp form)
|
||||
(eq (first form) op)
|
||||
(null (cddr form))))
|
||||
;; IF-style function expander
|
||||
(defun expand-if-macro (condition then-value else-value env if-f not-f or-f and-f andnot-f type-name zero-val &key flip)
|
||||
(let* ((condition (expand-condition condition env))
|
||||
(then-value (macroexpand then-value env))
|
||||
(else-value (macroexpand else-value env))
|
||||
(then-zero? (equal then-value zero-val))
|
||||
(else-zero? (equal else-value zero-val)))
|
||||
(cond ((is-unary? condition not-f)
|
||||
(expand-if-macro (second condition) else-value then-value
|
||||
env if-f not-f or-f and-f andnot-f type-name zero-val
|
||||
:flip (not flip)))
|
||||
((and then-zero? else-zero?)
|
||||
zero-val)
|
||||
(then-zero?
|
||||
`(,andnot-f ,condition ,else-value))
|
||||
(else-zero?
|
||||
`(,and-f ,condition ,then-value))
|
||||
(t
|
||||
(let* ((csym (gensym))
|
||||
(args `((,and-f ,csym ,then-value)
|
||||
(,andnot-f ,csym ,else-value))))
|
||||
`(let ((,csym ,condition))
|
||||
(declare (type ,type-name ,csym)
|
||||
(:read-only ,csym))
|
||||
(,or-f ,@(if flip (reverse args) args)))))))))
|
||||
|
||||
(defmacro def-utility (name arg-types ret-type expansion &key expand-args expand-in-condition)
|
||||
"Defines and exports a function & compiler macro with the specified expansion."
|
||||
(let* ((anames (mapcar #'make-arg-name (make-arg-nums arg-types))))
|
||||
`(progn
|
||||
(export ',name)
|
||||
(eval-when (:compile-toplevel :load-toplevel)
|
||||
,@(if expand-in-condition
|
||||
`((setf (get ',name 'expand-in-condition) t)))
|
||||
(define-compiler-macro ,name (&environment env ,@anames)
|
||||
(declare (ignorable env))
|
||||
,@(loop for arg in (if (eq expand-args t) anames expand-args)
|
||||
collect `(setq ,arg (macroexpand ,arg env)))
|
||||
,expansion))
|
||||
(proclaim '(ftype (function ,(mapcar #'declaim-arg-type-of arg-types) ,ret-type) ,name))
|
||||
(defun ,name ,anames
|
||||
(declare (optimize (speed 0) (debug 0) (safety 1)))
|
||||
(let ,(mapcar #'list anames anames)
|
||||
(declare ,@(loop for an in anames and at in arg-types
|
||||
collect `(type ,at ,an)))
|
||||
;; Depends on the compiler macro being expanded:
|
||||
(,name ,@anames))))))
|
||||
|
||||
(defmacro def-if-function (name type-name postfix)
|
||||
`(def-utility ,name (,type-name ,type-name ,type-name) ,type-name
|
||||
(expand-if-macro arg0 arg1 arg2 env
|
||||
',name
|
||||
',(intern (format nil "NOT-~A" postfix))
|
||||
',(intern (format nil "OR-~A" postfix))
|
||||
',(intern (format nil "AND-~A" postfix))
|
||||
',(intern (format nil "ANDNOT-~A" postfix))
|
||||
',type-name
|
||||
'(,(intern (format nil "SETZERO-~A" postfix))))))
|
||||
|
||||
;;; Aligned array allocation
|
||||
|
||||
(deftype sse-array (elt-type &optional dims)
|
||||
"Type of arrays efficiently accessed by SSE aref intrinsics and returned by make-sse-array.
|
||||
Should be assumed to be SIMPLE-ARRAY, except that displacing with MAKE-SSE-ARRAY is allowed."
|
||||
(when (eq elt-type '*)
|
||||
(c::cmperr "SSE-ARRAY must have a specific element type."))
|
||||
(let ((upgraded (upgraded-array-element-type elt-type)))
|
||||
(when (member upgraded '(t bit))
|
||||
(c::cmperr "Invalid SSE-ARRAY element type: ~S" elt-type))
|
||||
(unless (subtypep upgraded elt-type)
|
||||
(c::cmpwarn "SSE-ARRAY element type ~S has been upgraded to ~S" elt-type upgraded))
|
||||
`(array ,upgraded ,dims)))
|
||||
|
||||
(defun make-sse-array (dimensions &rest args &key (element-type '(unsigned-byte 8)) displaced-to &allow-other-keys)
|
||||
"Allocates an SSE-ARRAY aligned to the 16-byte boundary. May flatten displacement chains for performance reasons."
|
||||
(if displaced-to
|
||||
(apply #'make-array dimensions args)
|
||||
(multiple-value-bind (elt-size adj-type)
|
||||
(array-element-type-byte-size element-type)
|
||||
(when (eq adj-type t)
|
||||
(error "Cannot use element type T with SSE."))
|
||||
(sys::remf args :element-type)
|
||||
(let* ((full-size (if (numberp dimensions)
|
||||
dimensions
|
||||
(reduce #'* dimensions)))
|
||||
(padded-size (+ full-size (ceiling 15 elt-size)))
|
||||
(array (apply #'make-array padded-size :element-type adj-type args))
|
||||
(misalign (ffi:c-inline (array) (:object) :int
|
||||
"(((unsigned long)(#0)->array.self.b8) & 15)"
|
||||
:one-liner t))
|
||||
(offset (/ (if (> misalign 0) (- 16 misalign) 0) elt-size)))
|
||||
(make-array dimensions :element-type element-type
|
||||
:displaced-to array :displaced-index-offset offset)))))
|
||||
|
||||
;;; Single-float tools
|
||||
|
||||
;; Constants
|
||||
|
||||
(defmacro set-true-ss ()
|
||||
(load-time-value (make-pack-of-bin #xFFFFFFFF :as 'float-sse-pack)))
|
||||
|
||||
(defmacro set-true-ps ()
|
||||
(load-time-value (make-pack-of-bin -1 :as 'float-sse-pack)))
|
||||
|
||||
(eval-when (:compile-toplevel :load-toplevel)
|
||||
(define-symbol-macro 0.0-ps (setzero-ps))
|
||||
|
||||
(define-symbol-macro true-ss (set-true-ss))
|
||||
(define-symbol-macro false-ss (setzero-ps))
|
||||
|
||||
(define-symbol-macro true-ps (set-true-ps))
|
||||
(define-symbol-macro false-ps (setzero-ps)))
|
||||
|
||||
;; Bitwise if
|
||||
|
||||
(def-if-function if-ps float-sse-pack #:ps)
|
||||
|
||||
;; Arithmetic negation (xor with negative zero)
|
||||
|
||||
(def-utility neg-ss (float-sse-pack) float-sse-pack
|
||||
`(xor-ps ,arg0 ,(load-time-value (make-pack-of-bin #x80000000 :as 'float-sse-pack))))
|
||||
|
||||
(def-utility neg-ps (float-sse-pack) float-sse-pack
|
||||
`(xor-ps ,arg0 ,(load-time-value
|
||||
(make-pack-of-bin #x80000000800000008000000080000000 :as 'float-sse-pack))))
|
||||
|
||||
;; Logical inversion
|
||||
|
||||
(def-utility not-ps (float-sse-pack) float-sse-pack
|
||||
(or (lookup-flip arg0 '((=-ps . /=-ps)
|
||||
(<-ps . /<-ps)
|
||||
(<=-ps . /<=-ps)
|
||||
(>-ps . />-ps)
|
||||
(>=-ps . />=-ps)
|
||||
(cmpord-ps . cmpunord-ps)
|
||||
(not-ps . :identity)))
|
||||
`(xor-ps ,arg0 true-ps))
|
||||
:expand-args t)
|
||||
|
||||
;; Shuffle
|
||||
|
||||
(defun shuffle-ps (x y mask)
|
||||
(declare (optimize (speed 0) (debug 0) (safety 1))
|
||||
(type t x y mask))
|
||||
(check-type x sse-pack)
|
||||
(check-type y sse-pack)
|
||||
(check-type mask (unsigned-byte 8))
|
||||
(ffi:c-inline (x y mask) (:object :object :int) :float-sse-pack
|
||||
"_mm_setr_ps(
|
||||
(#0)->sse.data.sf[(#2)&3],
|
||||
(#0)->sse.data.sf[((#2)>>2)&3],
|
||||
(#1)->sse.data.sf[((#2)>>4)&3],
|
||||
(#1)->sse.data.sf[((#2)>>6)&3]
|
||||
)" :one-liner t))
|
||||
|
||||
;;; Double-float tools
|
||||
|
||||
;; Constants
|
||||
|
||||
(defmacro set-true-sd ()
|
||||
(load-time-value (make-pack-of-bin #xFFFFFFFFFFFFFFFF :as 'double-sse-pack)))
|
||||
|
||||
(defmacro set-true-pd ()
|
||||
(load-time-value (make-pack-of-bin -1 :as 'double-sse-pack)))
|
||||
|
||||
(eval-when (:compile-toplevel :load-toplevel)
|
||||
(define-symbol-macro 0.0-pd (setzero-pd))
|
||||
|
||||
(define-symbol-macro true-sd (set-true-sd))
|
||||
(define-symbol-macro false-sd (setzero-pd))
|
||||
|
||||
(define-symbol-macro true-pd (set-true-pd))
|
||||
(define-symbol-macro false-pd (setzero-pd)))
|
||||
|
||||
;; Bitwise if
|
||||
|
||||
(def-if-function if-pd double-sse-pack #:pd)
|
||||
|
||||
;; Arithmetic negation (xor with negative zero)
|
||||
|
||||
(def-utility neg-sd (double-sse-pack) double-sse-pack
|
||||
`(xor-pd ,arg0
|
||||
,(load-time-value
|
||||
(make-pack-of-bin #x8000000000000000 :as 'double-sse-pack))))
|
||||
|
||||
(def-utility neg-pd (double-sse-pack) double-sse-pack
|
||||
`(xor-pd ,arg0
|
||||
,(load-time-value
|
||||
(make-pack-of-bin #x80000000000000008000000000000000 :as 'double-sse-pack))))
|
||||
|
||||
;; Logical inversion
|
||||
|
||||
(def-utility not-pd (double-sse-pack) double-sse-pack
|
||||
(or (lookup-flip arg0 '((=-pd . /=-pd)
|
||||
(<-pd . /<-pd)
|
||||
(<=-pd . /<=-pd)
|
||||
(>-pd . />-pd)
|
||||
(>=-pd . />=-pd)
|
||||
(cmpord-pd . cmpunord-pd)
|
||||
(not-pd . :identity)))
|
||||
`(xor-pd ,arg0 true-pd))
|
||||
:expand-args t)
|
||||
|
||||
;; Shuffle
|
||||
|
||||
(defun shuffle-pd (x y mask)
|
||||
(declare (optimize (speed 0) (debug 0) (safety 1))
|
||||
(type t x y mask))
|
||||
(check-type x sse-pack)
|
||||
(check-type y sse-pack)
|
||||
(check-type mask (unsigned-byte 2))
|
||||
(ffi:c-inline (x y mask) (:object :object :int) :double-sse-pack
|
||||
"_mm_setr_pd(
|
||||
(#0)->sse.data.df[(#2)&1],
|
||||
(#1)->sse.data.df[((#2)>>1)&1]
|
||||
)" :one-liner t))
|
||||
|
||||
;;; Integer tools
|
||||
|
||||
;; Constants
|
||||
|
||||
(defmacro set-true-pi ()
|
||||
(load-time-value (make-pack-of-bin -1 :as 'int-sse-pack)))
|
||||
|
||||
(eval-when (:compile-toplevel :load-toplevel)
|
||||
(define-symbol-macro 0-pi (setzero-pi))
|
||||
|
||||
(define-symbol-macro true-pi (set-true-pi))
|
||||
(define-symbol-macro false-pi (setzero-pi)))
|
||||
|
||||
;; Bitwise if
|
||||
|
||||
(def-if-function if-pi float-sse-pack #:pi)
|
||||
|
||||
;; Arithmetic negation (subtract from 0)
|
||||
|
||||
(macrolet ((frob (name subf)
|
||||
`(def-utility ,name (int-sse-pack) int-sse-pack
|
||||
`(,',subf (setzero-pi) ,arg0))))
|
||||
(frob neg-pi8 sub-pi8)
|
||||
(frob neg-pi16 sub-pi16)
|
||||
(frob neg-pi32 sub-pi32)
|
||||
(frob neg-pi64 sub-pi64))
|
||||
|
||||
;; Logical inversion
|
||||
|
||||
(def-utility not-pi (int-sse-pack) int-sse-pack
|
||||
(or (lookup-flip arg0 '((<=-pi8 . >-pi8)
|
||||
(<=-pi16 . >-pi16)
|
||||
(<=-pi32 . >-pi32)
|
||||
(>=-pi8 . <-pi8)
|
||||
(>=-pi16 . <-pi16)
|
||||
(>=-pi32 . <-pi32)
|
||||
(/=-pi8 . =-pi8)
|
||||
(/=-pi16 . =-pi16)
|
||||
(/=-pi32 . =-pi32)
|
||||
(not-pi . :identity))
|
||||
:no-reverse t)
|
||||
`(xor-pi ,arg0 true-pi))
|
||||
:expand-args t)
|
||||
|
||||
(macrolet ((frob (name code)
|
||||
`(def-utility ,name (int-sse-pack int-sse-pack) int-sse-pack
|
||||
,code
|
||||
:expand-in-condition t)))
|
||||
|
||||
(frob <=-pi8 `(not-pi (>-pi8 ,arg0 ,arg1)))
|
||||
(frob <=-pi16 `(not-pi (>-pi16 ,arg0 ,arg1)))
|
||||
(frob <=-pi32 `(not-pi (>-pi32 ,arg0 ,arg1)))
|
||||
|
||||
(frob >=-pi8 `(not-pi (<-pi8 ,arg0 ,arg1)))
|
||||
(frob >=-pi16 `(not-pi (<-pi16 ,arg0 ,arg1)))
|
||||
(frob >=-pi32 `(not-pi (<-pi32 ,arg0 ,arg1)))
|
||||
|
||||
(frob /=-pi8 `(not-pi (=-pi8 ,arg0 ,arg1)))
|
||||
(frob /=-pi16 `(not-pi (=-pi16 ,arg0 ,arg1)))
|
||||
(frob /=-pi32 `(not-pi (=-pi32 ,arg0 ,arg1))))
|
||||
|
||||
;; Shifts
|
||||
|
||||
(defun slli-pi (x shift)
|
||||
(declare (optimize (speed 0) (debug 0) (safety 1))
|
||||
(type t x shift))
|
||||
(check-type x sse-pack)
|
||||
(check-type shift (unsigned-byte 8))
|
||||
(ffi:c-inline (x shift) (:object :int) :object
|
||||
"cl_object rv = ecl_make_int_sse_pack(_mm_setzero_si128());
|
||||
unsigned bshift=(#1), i;
|
||||
for (i = 0; i + bshift < 16; i++)
|
||||
rv->sse.data.b8[i+bshift] = (#0)->sse.data.b8[i];
|
||||
@(return) = rv;"))
|
||||
|
||||
(defun srli-pi (x shift)
|
||||
(declare (optimize (speed 0) (debug 0) (safety 1))
|
||||
(type t x shift))
|
||||
(check-type x sse-pack)
|
||||
(check-type shift (unsigned-byte 8))
|
||||
(ffi:c-inline (x shift) (:object :int) :object
|
||||
"cl_object rv = ecl_make_int_sse_pack(_mm_setzero_si128());
|
||||
int bshift=(#1), i;
|
||||
for (i = 16 - bshift - 1; i >= 0; i--)
|
||||
rv->sse.data.b8[i] = (#0)->sse.data.b8[i+bshift];
|
||||
@(return) = rv;"))
|
||||
|
||||
;; Extract & insert
|
||||
|
||||
(defun extract-pi16 (x index)
|
||||
(declare (optimize (speed 0) (debug 0) (safety 1))
|
||||
(type t x index))
|
||||
(check-type x sse-pack)
|
||||
(check-type index (unsigned-byte 8))
|
||||
(ffi:c-inline (x index) (:object :int) :fixnum
|
||||
"*((unsigned short*)&(#0)->sse.data.b8[((#1)&3)*2])"
|
||||
:one-liner t))
|
||||
|
||||
(defun insert-pi16 (x ival index)
|
||||
(declare (optimize (speed 0) (debug 0) (safety 1))
|
||||
(type t x ival index))
|
||||
(check-type x sse-pack)
|
||||
(check-type index (unsigned-byte 8))
|
||||
(ffi:c-inline (x ival index) (:int-sse-pack :int :int) :object
|
||||
"cl_object rv = ecl_make_int_sse_pack(#0);
|
||||
*((unsigned short*)&rv->sse.data.b8[((#2)&3)*2]) = (unsigned short)(#1);
|
||||
@(return) = rv;"))
|
||||
|
||||
;; Shuffles
|
||||
|
||||
(defun shuffle-pi32 (x mask)
|
||||
(declare (optimize (speed 0) (debug 0) (safety 1))
|
||||
(type t x mask))
|
||||
(check-type x sse-pack)
|
||||
(check-type mask (unsigned-byte 8))
|
||||
(ffi:c-inline (x mask) (:object :int) :int-sse-pack
|
||||
"unsigned *pd = (unsigned*)(#0)->sse.data.b8;
|
||||
@(return) = _mm_setr_epi32(pd[(#1)&3],pd[((#1)>>2)&3],pd[((#1)>>4)&3],pd[((#1)>>6)&3]);"))
|
||||
|
||||
(defun shufflelo-pi16 (x mask)
|
||||
(declare (optimize (speed 0) (debug 0) (safety 1))
|
||||
(type t x mask))
|
||||
(check-type x sse-pack)
|
||||
(check-type mask (unsigned-byte 8))
|
||||
(ffi:c-inline (x mask) (:object :int) :int-sse-pack
|
||||
"unsigned short *pd = (unsigned short*)(#0)->sse.data.b8;
|
||||
@(return) = _mm_setr_epi16(
|
||||
pd[(#1)&3],pd[((#1)>>2)&3],pd[((#1)>>4)&3],pd[(((#1)>>6)&3)],
|
||||
pd[4], pd[5], pd[6], pd[7]
|
||||
);"))
|
||||
|
||||
(defun shufflehi-pi16 (x mask)
|
||||
(declare (optimize (speed 0) (debug 0) (safety 1))
|
||||
(type t x mask))
|
||||
(check-type x sse-pack)
|
||||
(check-type mask (unsigned-byte 8))
|
||||
(ffi:c-inline (x mask) (:object :int) :int-sse-pack
|
||||
"unsigned short *pb = (unsigned short*)(#0)->sse.data.b8, *pd = pb+4;
|
||||
@(return) = _mm_setr_epi16(
|
||||
pb[0], pb[1], pb[2], pb[3],
|
||||
pd[(#1)&3],pd[((#1)>>2)&3],pd[((#1)>>4)&3],pd[(((#1)>>6)&3)]
|
||||
);"))
|
||||
|
||||
280
contrib/cl-simd/sbcl-arrays.lisp
Normal file
280
contrib/cl-simd/sbcl-arrays.lisp
Normal file
|
|
@ -0,0 +1,280 @@
|
|||
;;; -*- mode: Lisp; indent-tabs-mode: nil; -*-
|
||||
;;;
|
||||
;;; Copyright (c) 2010, Alexander Gavrilov (angavrilov@gmail.com)
|
||||
;;;
|
||||
;;; This file contains the groundwork for vectorized
|
||||
;;; array access intrinsics.
|
||||
;;;
|
||||
|
||||
(in-package #:SSE)
|
||||
|
||||
;; SSE array element size calculation
|
||||
|
||||
(eval-when (:compile-toplevel :load-toplevel :execute)
|
||||
(defun sse-elt-shift-from-saetp (info)
|
||||
(and info
|
||||
(subtypep (saetp-specifier info) 'number)
|
||||
(not (saetp-fixnum-p info))
|
||||
(case (saetp-n-bits info)
|
||||
(8 0) (16 1) (32 2) (64 3) (128 4)))))
|
||||
|
||||
(defglobal %%size-shift-table%%
|
||||
(let ((arr (make-array (1+ widetag-mask) :initial-element nil)))
|
||||
(loop
|
||||
for info across *specialized-array-element-type-properties*
|
||||
for shift = (sse-elt-shift-from-saetp info)
|
||||
when shift
|
||||
do (setf (svref arr (saetp-typecode info)) shift))
|
||||
arr)
|
||||
"A table of element size shifts for supported SSE array types.")
|
||||
|
||||
(declaim (inline sse-elt-shift-of)
|
||||
(ftype (function (t) (integer 0 4)) sse-elt-shift-of))
|
||||
|
||||
(defun sse-elt-shift-of (obj)
|
||||
"Returns the SSE element size shift for the given object,
|
||||
or fails if it is not a valid SSE vector."
|
||||
(declare (optimize (safety 0)))
|
||||
(the (integer 0 4)
|
||||
(or (svref %%size-shift-table%%
|
||||
(if (sb-vm::%other-pointer-p obj)
|
||||
(%other-pointer-widetag obj)
|
||||
0))
|
||||
(error 'type-error
|
||||
:datum obj
|
||||
:expected-type 'sse-array))))
|
||||
|
||||
;;; Type and allocation
|
||||
|
||||
(deftype sse-array (&optional (elt-type '* et-p) dims)
|
||||
"Type of arrays efficiently accessed by SSE aref intrinsics and returned by make-sse-array.
|
||||
Should be assumed to be SIMPLE-ARRAY, except that displacing with MAKE-SSE-ARRAY is allowed."
|
||||
(if (eq elt-type '*)
|
||||
(progn
|
||||
(when et-p
|
||||
(error "SSE-ARRAY must have a specific element type."))
|
||||
`(simple-array * ,dims))
|
||||
(let* ((upgraded (upgraded-array-element-type elt-type))
|
||||
(shift (sse-elt-shift-from-saetp (find-saetp upgraded))))
|
||||
(when (null shift)
|
||||
(error "Invalid SSE-ARRAY element type: ~S" elt-type))
|
||||
(unless (subtypep upgraded elt-type)
|
||||
(warn "SSE-ARRAY element type ~S has been upgraded to ~S" elt-type upgraded))
|
||||
`(simple-array ,upgraded ,dims))))
|
||||
|
||||
(defun make-sse-array (dimensions &key (element-type '(unsigned-byte 8)) (initial-element nil ie-p) displaced-to (displaced-index-offset 0))
|
||||
"Allocates an SSE-ARRAY aligned to the 16-byte boundary. Flattens displacement chains for performance reasons."
|
||||
(let* ((upgraded (upgraded-array-element-type element-type))
|
||||
(shift (sse-elt-shift-from-saetp (find-saetp upgraded))))
|
||||
(when (null shift)
|
||||
(error "Invalid SSE-ARRAY element type: ~S" element-type))
|
||||
(if displaced-to
|
||||
;; Fake displacement by allocating a simple-array header
|
||||
(let* ((dimensions (if (listp dimensions) dimensions (list dimensions)))
|
||||
(rank (length dimensions))
|
||||
(count (reduce #'* dimensions)))
|
||||
(unless (subtypep element-type (array-element-type displaced-to))
|
||||
(error "can't displace an array of type ~S into another of type ~S"
|
||||
element-type (array-element-type displaced-to)))
|
||||
(with-array-data ((data displaced-to)
|
||||
(start displaced-index-offset)
|
||||
(end))
|
||||
(unless (= start 0)
|
||||
(error "SSE-ARRAY does not support displaced index offset."))
|
||||
(unless (<= count end)
|
||||
(array-bounding-indices-bad-error data start count))
|
||||
(if (= rank 1)
|
||||
(progn
|
||||
(when (< count end)
|
||||
(warn "SSE-ARRAY displaced size extended to the full length of the vector."))
|
||||
data)
|
||||
(let ((new-array (make-array-header simple-array-widetag rank)))
|
||||
(set-array-header new-array data count nil 0 dimensions nil t)))))
|
||||
;; X86-64 vectors are already aligned to 16 bytes
|
||||
(apply #'make-array dimensions :element-type upgraded
|
||||
(if ie-p (list :initial-element initial-element))))))
|
||||
|
||||
;;; AREF intrinsic definition helpers
|
||||
|
||||
(defconstant +vector-data-fixup+
|
||||
(- (* vector-data-offset n-word-bytes) other-pointer-lowtag)
|
||||
"Offset from a tagged vector pointer to its data")
|
||||
|
||||
(defmacro array-data-expr (array-var &optional is-vector)
|
||||
(ecase is-vector
|
||||
(:yes array-var)
|
||||
(:no `(%array-data-vector ,array-var))
|
||||
((nil)
|
||||
`(if (array-header-p ,array-var)
|
||||
(%array-data-vector ,array-var)
|
||||
,array-var))))
|
||||
|
||||
;; Depends on the vector-length field being in the same place
|
||||
;; as the array fill pointer, which for simple-array is equal
|
||||
;; to the total size.
|
||||
(defknown %sse-array-size (simple-array fixnum) array-total-size (flushable always-translatable))
|
||||
|
||||
(define-vop (%sse-array-size/0)
|
||||
(:translate %sse-array-size)
|
||||
(:args (array :scs (descriptor-reg)))
|
||||
(:arg-types * (:constant (integer 0 0)))
|
||||
(:info gap)
|
||||
(:ignore gap)
|
||||
(:policy :fast-safe)
|
||||
(:results (result :scs (any-reg)))
|
||||
(:result-types tagged-num)
|
||||
(:generator 3
|
||||
(loadw result array vector-length-slot other-pointer-lowtag)))
|
||||
|
||||
(define-vop (%sse-array-size %sse-array-size/0)
|
||||
(:arg-types * (:constant (integer 1 16)))
|
||||
(:ignore)
|
||||
(:temporary (:sc any-reg) tmp)
|
||||
(:generator 8
|
||||
(loadw result array vector-length-slot other-pointer-lowtag)
|
||||
(inst mov tmp (fixnumize gap))
|
||||
(inst cmp result tmp)
|
||||
(inst cmov :ng tmp result)
|
||||
(inst sub result tmp)))
|
||||
|
||||
(defmacro with-sse-data (((sap-var data-var array) (offset-var index)) &body code)
|
||||
;; Compute a SAP and offset for the specified array and index. Check bounds.
|
||||
(with-unique-names (data-index data-end elt-shift access-size)
|
||||
(once-only ((array array)
|
||||
(index index))
|
||||
`(locally
|
||||
(declare (optimize (insert-array-bounds-checks 0)))
|
||||
(with-array-data ((,data-var ,array)
|
||||
(,data-index ,index)
|
||||
(,data-end))
|
||||
(let* ((,sap-var (int-sap (get-lisp-obj-address ,data-var)))
|
||||
(,elt-shift (sse-elt-shift-of ,data-var))
|
||||
(,access-size (ash 16 (- ,elt-shift)))
|
||||
(,offset-var (+ (ash ,data-index ,elt-shift) +vector-data-fixup+)))
|
||||
(declare (type system-area-pointer ,sap-var)
|
||||
(type fixnum ,offset-var))
|
||||
(unless (<= 0 ,data-index (+ ,data-index ,access-size) ,data-end)
|
||||
(array-bounding-indices-bad-error ,array ,index (+ ,index ,access-size)))
|
||||
,@code))))))
|
||||
|
||||
(defun sse-array-info-or-give-up (lvar)
|
||||
;; Look up the SSE element size and check if it is definitely a vector
|
||||
(let ((type (lvar-type lvar)))
|
||||
(unless (and (array-type-p type)
|
||||
(not (array-type-complexp type)))
|
||||
(give-up-ir1-transform "not a simple array"))
|
||||
(let* ((etype (array-type-specialized-element-type type))
|
||||
(shift (sse-elt-shift-from-saetp
|
||||
(if (eq etype *wild-type*) nil
|
||||
(find-saetp-by-ctype etype)))))
|
||||
(unless shift
|
||||
(give-up-ir1-transform "not a known SSE-compatible array element type: ~S"
|
||||
(type-specifier etype)))
|
||||
(values (ash 1 shift) ; step
|
||||
(1- (ash 16 (- shift))) ; gap
|
||||
(and (listp (array-type-dimensions type))
|
||||
(if (null (cdr (array-type-dimensions type))) :yes :no))))))
|
||||
|
||||
(defmacro def-aref-intrinsic (postfix rtype reader writer &key (check-bounds t))
|
||||
(let* ((rm-aref (symbolicate "ROW-MAJOR-AREF-" postfix))
|
||||
(rm-aset (if writer (symbolicate "ROW-MAJOR-ASET-" postfix)))
|
||||
(aref (symbolicate "AREF-" postfix))
|
||||
(aset (if writer (symbolicate "%ASET-" postfix)))
|
||||
(reader-vop (symbolicate "%" reader))
|
||||
(reader/ix-vop (symbolicate "%" reader "/IX"))
|
||||
(writer-vop (if writer (symbolicate "%" writer)))
|
||||
(writer/ix-vop (if writer (symbolicate "%" writer "/IX")))
|
||||
(rtype (or rtype '(values)))
|
||||
(index-expression
|
||||
(ecase check-bounds
|
||||
(t ``(the index (%check-bound array (%sse-array-size array ,gap) index)))
|
||||
((nil) ``(the index index))
|
||||
(:no-gap ``(the index (%check-bound array (%sse-array-size array 0) index))))))
|
||||
`(progn
|
||||
;; ROW-MAJOR-AREF
|
||||
(export ',rm-aref)
|
||||
(defknown ,rm-aref (array index) ,rtype (foldable flushable))
|
||||
(defun ,rm-aref (array index)
|
||||
(with-sse-data ((sap data array)
|
||||
(offset index))
|
||||
(,reader-vop sap offset)))
|
||||
;;
|
||||
(deftransform ,rm-aref ((array index) (simple-array t) * :important t)
|
||||
,(format nil "open-code ~A" rm-aref)
|
||||
(multiple-value-bind (step gap is-vector) (sse-array-info-or-give-up array)
|
||||
(declare (ignorable gap))
|
||||
`(,',reader/ix-vop (array-data-expr array ,is-vector)
|
||||
,,index-expression
|
||||
,step ,+vector-data-fixup+)))
|
||||
;; AREF
|
||||
(export ',aref)
|
||||
(defknown ,aref (array &rest index) ,rtype (foldable flushable))
|
||||
(defun ,aref (array &rest indices)
|
||||
(declare (truly-dynamic-extent indices))
|
||||
(with-sse-data ((sap data array)
|
||||
(offset (%array-row-major-index array indices)))
|
||||
(,reader-vop sap offset)))
|
||||
;;
|
||||
(defoptimizer (,aref derive-type) ((array &rest indices) node)
|
||||
(assert-array-rank array (length indices))
|
||||
(values-specifier-type ',rtype))
|
||||
(deftransform ,aref ((array &rest indices) (simple-array &rest t) * :important t)
|
||||
,(format nil "open-code ~A" aref)
|
||||
(multiple-value-bind (step gap is-vector) (sse-array-info-or-give-up array)
|
||||
(declare (ignorable gap))
|
||||
(let ((syms (make-gensym-list (length indices))))
|
||||
`(lambda (array ,@syms)
|
||||
(let ((index ,(if (eq is-vector :yes) (first syms)
|
||||
`(array-row-major-index array ,@syms))))
|
||||
(,',reader/ix-vop (array-data-expr array ,is-vector)
|
||||
,,index-expression
|
||||
,step ,+vector-data-fixup+))))))
|
||||
,@(if writer
|
||||
`(;; ROW-MAJOR-ASET
|
||||
(defknown ,rm-aset (array index sse-pack) ,rtype (unsafe))
|
||||
(defsetf ,rm-aref ,rm-aset)
|
||||
(defun ,rm-aset (array index new-value)
|
||||
(with-sse-data ((sap data array)
|
||||
(offset index))
|
||||
(,writer-vop sap offset (the ,rtype new-value))
|
||||
new-value))
|
||||
;;
|
||||
(deftransform ,rm-aset ((array index value) (simple-array t t) * :important t)
|
||||
,(format nil "open-code ~A" rm-aset)
|
||||
(multiple-value-bind (step gap is-vector) (sse-array-info-or-give-up array)
|
||||
(declare (ignorable gap))
|
||||
`(progn
|
||||
(,',writer/ix-vop (array-data-expr array ,is-vector)
|
||||
,,index-expression
|
||||
,step ,+vector-data-fixup+
|
||||
(the sse-pack value))
|
||||
value)))
|
||||
;; %ASET
|
||||
(defknown ,aset (array &rest t) ,rtype (unsafe))
|
||||
(defsetf ,aref ,aset)
|
||||
(defun ,aset (array &rest stuff)
|
||||
(let ((new-value (car (last stuff))))
|
||||
(with-sse-data ((sap data array)
|
||||
(offset (%array-row-major-index array (nbutlast stuff))))
|
||||
(,writer-vop sap offset (the ,rtype new-value))
|
||||
new-value)))
|
||||
;;
|
||||
(defoptimizer (,aset derive-type) ((array &rest stuff) node)
|
||||
(assert-array-rank array (1- (length stuff)))
|
||||
(assert-lvar-type (car (last stuff)) (specifier-type 'sse-pack)
|
||||
(lexenv-policy (node-lexenv node)))
|
||||
(specifier-type ',rtype))
|
||||
(deftransform ,aset ((array &rest stuff) (simple-array &rest t) * :important t)
|
||||
,(format nil "open-code ~A" aset)
|
||||
(multiple-value-bind (step gap is-vector) (sse-array-info-or-give-up array)
|
||||
(declare (ignorable gap))
|
||||
(let ((syms (make-gensym-list (length stuff))))
|
||||
`(lambda (array ,@syms)
|
||||
(let ((index (the index ,(if (eq is-vector :yes) (first syms)
|
||||
`(array-row-major-index array ,@(butlast syms))))))
|
||||
(,',writer/ix-vop (array-data-expr array ,is-vector)
|
||||
,,index-expression
|
||||
,step ,+vector-data-fixup+
|
||||
(the sse-pack ,(car (last syms)))))
|
||||
,(car (last syms)))))))))))
|
||||
|
||||
557
contrib/cl-simd/sbcl-core.lisp
Normal file
557
contrib/cl-simd/sbcl-core.lisp
Normal file
|
|
@ -0,0 +1,557 @@
|
|||
;;; -*- mode: Lisp; indent-tabs-mode: nil; -*-
|
||||
;;;
|
||||
;;; Copyright (c) 2010, Alexander Gavrilov (angavrilov@gmail.com)
|
||||
;;;
|
||||
;;; This file contains definitions of abstract VOPs, macros
|
||||
;;; and utility functions used to implement the intrinsics.
|
||||
;;;
|
||||
|
||||
(in-package #:SSE)
|
||||
|
||||
;;; The specific pack types
|
||||
|
||||
(deftype int-sse-pack () '(sse-pack integer))
|
||||
(deftype float-sse-pack () '(sse-pack single-float))
|
||||
(deftype double-sse-pack () '(sse-pack double-float))
|
||||
|
||||
;;; Helper functions
|
||||
|
||||
(defconstant +uint32-mask+ #xFFFFFFFF)
|
||||
(defconstant +uint64-mask+ #xFFFFFFFFFFFFFFFF)
|
||||
(defconstant +min-int32+ (- (ash 1 31)))
|
||||
(defconstant +max-int32+ (1- (ash 1 31)))
|
||||
|
||||
(defun type-name-to-primitive (lt)
|
||||
(primitive-type-name (primitive-type (specifier-type lt))))
|
||||
|
||||
(defun move-cmd-for-type (lt)
|
||||
(ecase lt
|
||||
(int-sse-pack 'movdqa)
|
||||
((float-sse-pack double-sse-pack) 'movaps)))
|
||||
|
||||
(defun ensure-reg-or-mem (tn)
|
||||
(sc-case tn
|
||||
((sse-pack-immediate immediate)
|
||||
(register-inline-constant (tn-value tn)))
|
||||
(t tn)))
|
||||
|
||||
(defmacro ensure-load (type tgt src)
|
||||
`(unless (location= ,tgt ,src)
|
||||
(inst ,(move-cmd-for-type type) ,tgt (ensure-reg-or-mem ,src))))
|
||||
|
||||
(defmacro ensure-move (type tgt src)
|
||||
`(unless (location= ,tgt ,src)
|
||||
(inst ,(move-cmd-for-type type) ,tgt ,src)))
|
||||
|
||||
(defmacro save-intrinsic-spec (name info)
|
||||
`(eval-when (:compile-toplevel :load-toplevel :execute)
|
||||
(setf (get ',name 'intrinsic-spec) ',info)))
|
||||
|
||||
(defmacro def-splice-transform (name args &body code)
|
||||
(let* ((direct-args (mapcar (lambda (x) (if (consp x) (gensym) x)) args))
|
||||
(flat-args (mapcan (lambda (x) (if (consp x) (copy-list (rest x)) (list x))) args)))
|
||||
`(deftransform ,name ((,@direct-args) * *)
|
||||
,(format nil "Simplify combination ~A" (cons name args))
|
||||
,@(loop for spec in args and name in direct-args
|
||||
when (consp spec)
|
||||
collect `(splice-fun-args ,name ',(first spec) ,(1- (length spec))))
|
||||
(list* 'lambda ',flat-args ',code))))
|
||||
|
||||
;;; Index-offset splicing
|
||||
|
||||
(defun fold-index-addressing (fun-name index scale &key setter-p)
|
||||
(multiple-value-bind (func index-args) (extract-fun-args index '(+ - * ash) 2)
|
||||
(destructuring-bind (x constant) index-args
|
||||
(declare (ignorable x))
|
||||
(unless (constant-lvar-p constant)
|
||||
(give-up-ir1-transform))
|
||||
(let ((value (lvar-value constant))
|
||||
(scale-value (lvar-value scale)))
|
||||
(case func
|
||||
(* (unless (typep (* value scale-value) '(signed-byte 32))
|
||||
(give-up-ir1-transform "constant is too large for inlining")))
|
||||
(ash (unless (and (>= value 0)
|
||||
(typep (ash scale-value value) '(signed-byte 32)))
|
||||
(give-up-ir1-transform "index shift is unsuitable for inlining"))))
|
||||
(splice-fun-args index func 2)
|
||||
(let* ((value-arg (when setter-p '(value)))
|
||||
(is-scale (member func '(* ash)))
|
||||
(new-scale (if is-scale `(,func scale const) 'scale))
|
||||
(new-offset (if is-scale 'offset `(,func offset (* const scale)))))
|
||||
`(lambda (thing index const scale offset ,@value-arg)
|
||||
(,fun-name thing index ,new-scale ,new-offset ,@value-arg)))))))
|
||||
|
||||
;;; Index-offset addressing
|
||||
|
||||
(defun is-tagged-load-scale (value)
|
||||
(not (logtest value (1- (ash 1 n-fixnum-tag-bits)))))
|
||||
|
||||
(deftype tagged-load-scale ()
|
||||
'(and fixnum (satisfies is-tagged-load-scale)))
|
||||
|
||||
(defun find-lea-scale (scale)
|
||||
(cond ((not (logtest scale 7)) (values (/ scale 8) 8))
|
||||
((not (logtest scale 3)) (values (/ scale 4) 4))
|
||||
((not (logtest scale 1)) (values (/ scale 2) 2))
|
||||
(t (values scale 1))))
|
||||
|
||||
(defun reduce-offset (ioffset scale offset)
|
||||
"Redistribute value from ioffset to offset, while keeping offset int32."
|
||||
(let* ((istep (if (< ioffset 0) -1 1))
|
||||
(icount (max 0
|
||||
(if (< ioffset 0)
|
||||
(- (1+ +min-int32+) ioffset) ; = (- +max-int32+)
|
||||
(- ioffset +max-int32+))))
|
||||
(ostep (* istep scale))
|
||||
(ocount (truncate (- (if (> ostep 0) +max-int32+ +min-int32+) offset)
|
||||
ostep))
|
||||
(count (min ocount icount)))
|
||||
(values (- ioffset (* count istep))
|
||||
(+ offset (* count ostep)))))
|
||||
|
||||
(defun split-offset (offset scale)
|
||||
(if (typep offset '(signed-byte 32))
|
||||
(values 0 offset)
|
||||
(multiple-value-bind (div rem) (floor offset scale)
|
||||
(assert (typep rem '(signed-byte 32)))
|
||||
(if (typep div '(signed-byte 32))
|
||||
(values div rem)
|
||||
(reduce-offset div scale rem)))))
|
||||
|
||||
(defun power-of-2? (scale)
|
||||
(and (> scale 0) (not (logtest scale (1- scale)))))
|
||||
|
||||
(defun find-power-of-2 (scale)
|
||||
(assert (power-of-2? scale))
|
||||
(loop for i from 0 and sv = scale then (ash sv -1)
|
||||
when (<= sv 1) return i))
|
||||
|
||||
(defun make-scaled-ea (size sap index scale offset tmp &key fixnum-index)
|
||||
"Returns an ea representing the given index*scale + offset formula.
|
||||
May emit additional instructions using the temporary register."
|
||||
(assemble ()
|
||||
(if (or (sc-is index immediate) (= scale 0))
|
||||
;; Fully constant offset
|
||||
(let ((value (if (= scale 0) offset
|
||||
(+ (* (tn-value index) scale) offset))))
|
||||
(assert (typep value '(signed-byte 64)))
|
||||
(if (typep value '(signed-byte 32))
|
||||
(make-ea size :base sap :disp value)
|
||||
(progn
|
||||
(inst mov tmp (register-inline-constant value))
|
||||
(make-ea size :base sap :index tmp))))
|
||||
;; Indexing
|
||||
(progn
|
||||
(when (sc-is index any-reg)
|
||||
(assert (and fixnum-index (is-tagged-load-scale scale)))
|
||||
(setf scale (ash scale (- n-fixnum-tag-bits))))
|
||||
(multiple-value-bind (rscale lscale) (find-lea-scale scale)
|
||||
;; One-instruction case?
|
||||
(if (and (= rscale 1) (typep offset '(signed-byte 32)))
|
||||
(make-ea size :base sap :index index :scale scale :disp offset)
|
||||
;; Use temporary
|
||||
(multiple-value-bind (roffset loffset) (split-offset offset lscale)
|
||||
(labels ((negate-when-<0 (register scale)
|
||||
(when (< scale 0)
|
||||
(inst neg register)))
|
||||
(emit-shift-mul (register scale)
|
||||
(inst shl register (find-power-of-2 (abs scale)))
|
||||
(negate-when-<0 register scale))
|
||||
;; Tries to compute tmp via LEA
|
||||
(try-use-lea (scale &optional base)
|
||||
(multiple-value-bind (rrscale rlscale) (find-lea-scale scale)
|
||||
(when (and (= (abs rrscale) 1) (typep (* rrscale roffset) '(signed-byte 32)))
|
||||
(when (and (= roffset 0) (null base)) ; minimize loffset
|
||||
(multiple-value-setq (roffset loffset) (floor offset lscale)))
|
||||
(let ((xoffset (* rrscale roffset)))
|
||||
(inst lea tmp
|
||||
(if (and (= rlscale 1) (null base))
|
||||
(make-ea :byte :base index :disp xoffset)
|
||||
(make-ea :byte :base base :index index :scale rlscale :disp xoffset))))
|
||||
(negate-when-<0 tmp rrscale)
|
||||
:success))))
|
||||
(declare (inline negate-when-<0 emit-shift-mul))
|
||||
(cond
|
||||
;; same register shift?
|
||||
((and (= roffset 0) (location= tmp index) (power-of-2? (abs rscale)))
|
||||
(emit-shift-mul tmp rscale))
|
||||
;; one LEA?
|
||||
((try-use-lea rscale))
|
||||
((try-use-lea (1- rscale) index))
|
||||
;; Generic case, use mul/shl and add
|
||||
(t
|
||||
(if (power-of-2? (abs rscale))
|
||||
(progn
|
||||
(move tmp index)
|
||||
(emit-shift-mul tmp rscale))
|
||||
(inst imul tmp index rscale))
|
||||
(unless (= roffset 0)
|
||||
;; Make loffset as small as possible
|
||||
(multiple-value-setq (roffset loffset) (floor offset lscale))
|
||||
(if (typep roffset '(signed-byte 32))
|
||||
(inst add tmp roffset)
|
||||
(inst add tmp (register-inline-constant roffset))))))
|
||||
(make-ea size :base sap :index tmp :scale lscale :disp loffset)))))))))
|
||||
|
||||
;; Initialization
|
||||
|
||||
(defmacro def-float-set-intrinsic (&whole whole pubname fname atype aregtype rtype move)
|
||||
(declare (ignore pubname))
|
||||
`(progn
|
||||
(save-intrinsic-spec ,fname ,whole)
|
||||
(defknown ,fname (,atype) ,rtype (foldable flushable))
|
||||
(define-vop (,fname)
|
||||
(:translate ,fname)
|
||||
(:args (arg :scs (,aregtype) :target dst))
|
||||
(:arg-types ,atype)
|
||||
(:results (dst :scs (sse-reg)))
|
||||
(:result-types ,(type-name-to-primitive rtype))
|
||||
(:policy :fast-safe)
|
||||
(:generator 1
|
||||
(unless (location= dst arg)
|
||||
(inst ,move dst arg))))))
|
||||
|
||||
;; Unary operations
|
||||
|
||||
(define-vop (sse-unary-base-op)
|
||||
;; no immediate because expecting to be folded
|
||||
(:args (x :scs (sse-reg)))
|
||||
(:arg-types sse-pack)
|
||||
(:policy :fast-safe)
|
||||
(:note "inline SSE unary operation")
|
||||
(:vop-var vop)
|
||||
(:save-p :compute-only))
|
||||
|
||||
(define-vop (sse-unary-op sse-unary-base-op)
|
||||
(:args (x :scs (sse-reg) :target r))
|
||||
(:results (r :scs (sse-reg))))
|
||||
|
||||
(define-vop (sse-unary-to-int-op sse-unary-base-op)
|
||||
(:results (r :scs (signed-reg))))
|
||||
|
||||
(define-vop (sse-unary-to-uint-op sse-unary-base-op)
|
||||
(:results (r :scs (unsigned-reg))))
|
||||
|
||||
(defmacro def-unary-intrinsic (&whole whole name rtype insn cost c-name &key partial immediate-arg result-size arg-type)
|
||||
(declare (ignore c-name arg-type))
|
||||
(let* ((imm (if immediate-arg '(imm)))
|
||||
(immt (if immediate-arg (list immediate-arg))))
|
||||
(assert (or (not partial) (not (subtypep rtype 'integer))))
|
||||
`(progn
|
||||
(export ',name)
|
||||
(save-intrinsic-spec ,name ,whole)
|
||||
(defknown ,name (sse-pack ,@immt) ,rtype (foldable flushable))
|
||||
(define-vop (,name ,(cond ((subtypep rtype 'unsigned-byte)
|
||||
'sse-unary-to-uint-op)
|
||||
((subtypep rtype 'integer)
|
||||
'sse-unary-to-int-op)
|
||||
(t 'sse-unary-op)))
|
||||
(:translate ,name)
|
||||
(:result-types ,(type-name-to-primitive rtype))
|
||||
,@(if immediate-arg
|
||||
`((:arg-types sse-pack (:constant ,immediate-arg))
|
||||
(:info imm)))
|
||||
(:generator ,cost
|
||||
,@(ecase partial
|
||||
(:one-arg `((ensure-move ,rtype r x)
|
||||
(inst ,insn r ,@imm)))
|
||||
(t `((ensure-move ,rtype r x)
|
||||
(inst ,insn r r ,@imm)))
|
||||
((nil) `((inst ,insn
|
||||
,(if result-size `(reg-in-size r ,result-size) 'r)
|
||||
x ,@imm)))))))))
|
||||
|
||||
;; Unary to int32 & sign-extend
|
||||
|
||||
(define-vop (sse-cvt-to-int32-op sse-unary-base-op)
|
||||
(:temporary (:sc signed-reg :offset rax-offset :target r :to :result) rax)
|
||||
(:results (r :scs (signed-reg))))
|
||||
|
||||
(defmacro def-cvt-to-int32-intrinsic (name rtype insn cost c-name &key arg-type)
|
||||
(declare (ignore arg-type))
|
||||
`(progn
|
||||
(export ',name)
|
||||
(save-intrinsic-spec ,name (def-unary-intrinsic ,name ,rtype ,insn ,cost ,c-name))
|
||||
(defknown ,name (sse-pack) (signed-byte 32) (foldable flushable))
|
||||
(define-vop (,name sse-cvt-to-int32-op)
|
||||
(:translate ,name)
|
||||
(:result-types ,(type-name-to-primitive rtype))
|
||||
(:generator ,cost
|
||||
(inst ,insn (reg-in-size rax :dword) x)
|
||||
(inst cdqe)
|
||||
(move r rax)))))
|
||||
|
||||
;; NOT intrinsics
|
||||
|
||||
(define-vop (sse-not-op sse-unary-op)
|
||||
(:temporary (:sc sse-reg) tmp))
|
||||
|
||||
(defmacro def-not-intrinsic (name rtype insn)
|
||||
`(progn
|
||||
(export ',name)
|
||||
(save-intrinsic-spec ,name (def-unary-intrinsic ,name ,rtype ,insn 3 nil))
|
||||
(defknown ,name (sse-pack) ,rtype (foldable flushable))
|
||||
(define-vop (,name sse-not-op)
|
||||
(:translate ,name)
|
||||
(:result-types ,(type-name-to-primitive rtype))
|
||||
(:generator 3
|
||||
(if (location= x r)
|
||||
(progn
|
||||
(inst pcmpeqd tmp tmp)
|
||||
(inst ,insn r tmp))
|
||||
(progn
|
||||
(inst pcmpeqd r r)
|
||||
(inst ,insn r x)))))))
|
||||
|
||||
;; Binary operations
|
||||
|
||||
(define-vop (sse-binary-base-op)
|
||||
(:args (x :scs (sse-reg sse-pack-immediate) :target r)
|
||||
(y :scs (sse-reg sse-pack-immediate)))
|
||||
(:results (r :scs (sse-reg)))
|
||||
(:arg-types sse-pack sse-pack)
|
||||
(:policy :fast-safe)
|
||||
(:note "inline SSE binary operation")
|
||||
(:vop-var vop)
|
||||
(:save-p :compute-only))
|
||||
|
||||
(define-vop (sse-binary-op sse-binary-base-op)
|
||||
(:temporary (:sc sse-reg) tmp))
|
||||
|
||||
(define-vop (sse-binary-comm-op sse-binary-base-op)
|
||||
(:args (x :scs (sse-reg sse-pack-immediate) :target r)
|
||||
(y :scs (sse-reg sse-pack-immediate) :target r)))
|
||||
|
||||
(defmacro def-binary-intrinsic (&whole whole name rtype insn cost c-name &key commutative tags immediate-arg x-type y-type)
|
||||
(declare (ignore c-name x-type y-type))
|
||||
(let* ((imm (if immediate-arg '(imm)))
|
||||
(immt (if immediate-arg (list immediate-arg))))
|
||||
`(progn
|
||||
(export ',name)
|
||||
(save-intrinsic-spec ,name ,whole)
|
||||
(defknown ,name (sse-pack sse-pack ,@immt) ,rtype (foldable flushable))
|
||||
(define-vop (,name ,(if commutative 'sse-binary-comm-op 'sse-binary-op))
|
||||
(:translate ,name)
|
||||
(:result-types ,(type-name-to-primitive rtype))
|
||||
,@(if immediate-arg
|
||||
`((:arg-types sse-pack sse-pack (:constant ,immediate-arg))
|
||||
(:info imm)))
|
||||
(:generator ,cost
|
||||
,@(if commutative
|
||||
`((when (location= y r)
|
||||
(rotatef x y))
|
||||
(ensure-load ,rtype r x)
|
||||
(inst ,insn ,@tags r (ensure-reg-or-mem y) ,@imm))
|
||||
`((unless (location= y r)
|
||||
(setf tmp r))
|
||||
(ensure-load ,rtype tmp x)
|
||||
(inst ,insn ,@tags tmp (ensure-reg-or-mem y) ,@imm)
|
||||
(ensure-move ,rtype r tmp))))))))
|
||||
|
||||
;;; XMM/Integer combination intrinsics
|
||||
|
||||
(define-vop (sse-int-base-op)
|
||||
(:results (r :scs (sse-reg)))
|
||||
(:policy :fast-safe)
|
||||
(:note "inline SSE/integer operation")
|
||||
(:vop-var vop)
|
||||
(:save-p :compute-only))
|
||||
|
||||
(define-vop (sse-int-op sse-int-base-op)
|
||||
(:args (x :scs (sse-reg sse-pack-immediate) :target r)
|
||||
(iv :scs (signed-reg signed-stack immediate)))
|
||||
(:arg-types sse-pack signed-num))
|
||||
|
||||
(define-vop (sse-uint-op sse-int-base-op)
|
||||
(:args (x :scs (sse-reg sse-pack-immediate) :target r)
|
||||
(iv :scs (unsigned-reg unsigned-stack immediate)))
|
||||
(:arg-types sse-pack unsigned-num))
|
||||
|
||||
(defmacro def-sse-int-intrinsic (&whole whole name itype rtype insn cost c-name &key make-temporary immediate-arg)
|
||||
(declare (ignore c-name))
|
||||
(let* ((imm (if immediate-arg '(imm)))
|
||||
(immt (if immediate-arg (list immediate-arg)))
|
||||
(unsigned? (subtypep itype 'unsigned-byte)))
|
||||
`(progn
|
||||
(export ',name)
|
||||
(save-intrinsic-spec ,name ,whole)
|
||||
(defknown ,name (sse-pack ,itype ,@immt) ,rtype (foldable flushable))
|
||||
(define-vop (,name ,(if unsigned? 'sse-uint-op 'sse-int-op))
|
||||
(:translate ,name)
|
||||
(:result-types ,(type-name-to-primitive rtype))
|
||||
,@(if immediate-arg
|
||||
`((:arg-types sse-pack
|
||||
,(if unsigned? 'unsigned-num 'signed-num)
|
||||
(:constant ,immediate-arg))
|
||||
(:info imm)))
|
||||
,@(if make-temporary
|
||||
`((:temporary (:sc sse-reg) tmp)))
|
||||
(:generator ,cost
|
||||
(ensure-load ,rtype r x)
|
||||
,@(if (eq make-temporary t)
|
||||
'((inst movd tmp (ensure-reg-or-mem iv)))
|
||||
make-temporary)
|
||||
(inst ,insn r ,(if make-temporary 'tmp '(ensure-reg-or-mem iv)) ,@imm))))))
|
||||
|
||||
;;; Memory intrinsics
|
||||
|
||||
(define-vop (sse-load-base-op)
|
||||
(:results (r :scs (sse-reg)))
|
||||
(:policy :fast-safe)
|
||||
(:note "inline SSE load operation"))
|
||||
|
||||
(define-vop (sse-load-op sse-load-base-op)
|
||||
(:args (sap :scs (sap-reg))
|
||||
(offset :scs (signed-reg)))
|
||||
(:arg-types system-area-pointer signed-num))
|
||||
|
||||
(define-vop (sse-xmm-load-op sse-load-base-op)
|
||||
(:args (value :scs (sse-reg sse-pack-immediate) :target r)
|
||||
(sap :scs (sap-reg))
|
||||
(offset :scs (signed-reg)))
|
||||
(:arg-types sse-pack system-area-pointer signed-num))
|
||||
|
||||
(define-vop (sse-load-imm-op sse-load-base-op)
|
||||
(:args (sap :scs (sap-reg)))
|
||||
(:arg-types system-area-pointer
|
||||
(:constant (signed-byte 32)))
|
||||
(:info offset))
|
||||
|
||||
(define-vop (sse-xmm-load-imm-op sse-load-base-op)
|
||||
(:args (value :scs (sse-reg sse-pack-immediate) :target r)
|
||||
(sap :scs (sap-reg)))
|
||||
(:arg-types sse-pack system-area-pointer
|
||||
(:constant (signed-byte 32)))
|
||||
(:info offset))
|
||||
|
||||
(define-vop (sse-load-ix-op sse-load-base-op)
|
||||
(:args (sap :scs (descriptor-reg) :to :eval)
|
||||
(index :scs (signed-reg immediate) :target tmp))
|
||||
(:arg-types * signed-num (:constant fixnum) (:constant fixnum))
|
||||
(:temporary (:sc signed-reg :from (:argument 1)) tmp)
|
||||
(:info scale offset))
|
||||
|
||||
(define-vop (sse-load-ix-op/tag sse-load-base-op)
|
||||
(:args (sap :scs (descriptor-reg) :to :eval)
|
||||
(index :scs (any-reg signed-reg immediate) :target tmp))
|
||||
(:arg-types * tagged-num (:constant tagged-load-scale) (:constant fixnum))
|
||||
(:temporary (:sc any-reg :from (:argument 1)) tmp)
|
||||
(:info scale offset))
|
||||
|
||||
(defmacro def-load-intrinsic (&whole whole name rtype insn c-name
|
||||
&key register-arg tags postfix-fmt (size :qword))
|
||||
(declare (ignore c-name postfix-fmt))
|
||||
(let* ((vop (symbolicate "%" name))
|
||||
(c-vop (symbolicate vop "-C"))
|
||||
(ix-vop (symbolicate vop "/IX"))
|
||||
(valtype (if register-arg '(sse-pack)))
|
||||
(valarg (if register-arg '(value)))
|
||||
(r-arg (if rtype '(r)))
|
||||
(rtypes (if rtype
|
||||
`(:result-types ,(type-name-to-primitive rtype))
|
||||
`(:results))))
|
||||
(assert (or rtype (not register-arg)))
|
||||
`(progn
|
||||
(export ',name)
|
||||
(save-intrinsic-spec ,name ,whole)
|
||||
(defknown ,vop (,@valtype system-area-pointer fixnum) ,(or rtype '(values)) (flushable always-translatable))
|
||||
(define-vop (,vop ,(if register-arg 'sse-xmm-load-op 'sse-load-op))
|
||||
(:translate ,vop)
|
||||
,rtypes
|
||||
(:generator 5
|
||||
,(if register-arg `(ensure-load ,rtype r value))
|
||||
(inst ,insn ,@tags ,@r-arg (make-ea ,size :base sap :index offset))))
|
||||
(define-vop (,c-vop ,(if register-arg 'sse-xmm-load-imm-op 'sse-load-imm-op))
|
||||
(:translate ,vop)
|
||||
,rtypes
|
||||
(:generator 4
|
||||
,(if register-arg `(ensure-load ,rtype r value))
|
||||
(inst ,insn ,@tags ,@r-arg (make-ea ,size :base sap :disp offset))))
|
||||
(def-splice-transform ,vop (,@valarg (sap+ sap offset1) offset2)
|
||||
(,vop ,@valarg sap (+ offset1 offset2)))
|
||||
,@(if (null register-arg)
|
||||
`(;; Vector indexing version
|
||||
(defknown ,ix-vop (simple-array fixnum fixnum fixnum) ,(or rtype '(values))
|
||||
(flushable always-translatable))
|
||||
(define-vop (,ix-vop sse-load-ix-op)
|
||||
(:translate ,ix-vop)
|
||||
,rtypes
|
||||
(:generator 4
|
||||
(inst ,insn ,@tags ,@r-arg (make-scaled-ea ,size sap index scale offset tmp))))
|
||||
(define-vop (,(symbolicate ix-vop "/TAG") sse-load-ix-op/tag)
|
||||
(:translate ,ix-vop)
|
||||
,rtypes
|
||||
(:generator 3
|
||||
(inst ,insn ,@tags ,@r-arg (make-scaled-ea ,size sap index scale offset tmp :fixnum-index t))))
|
||||
(deftransform ,ix-vop ((thing index scale offset))
|
||||
"fold semi-constant index expressions"
|
||||
(fold-index-addressing ',ix-vop index scale)))))))
|
||||
|
||||
(define-vop (sse-store-base-op)
|
||||
(:policy :fast-safe)
|
||||
(:note "inline SSE store operation"))
|
||||
|
||||
(define-vop (sse-store-op sse-store-base-op)
|
||||
(:args (sap :scs (sap-reg))
|
||||
(offset :scs (signed-reg))
|
||||
(value :scs (sse-reg)))
|
||||
(:arg-types system-area-pointer signed-num sse-pack))
|
||||
|
||||
(define-vop (sse-store-imm-op sse-store-base-op)
|
||||
(:args (sap :scs (sap-reg))
|
||||
(value :scs (sse-reg)))
|
||||
(:arg-types system-area-pointer
|
||||
(:constant (signed-byte 32))
|
||||
sse-pack)
|
||||
(:info offset))
|
||||
|
||||
(define-vop (sse-store-ix-op sse-store-base-op)
|
||||
(:args (sap :scs (descriptor-reg) :to :eval)
|
||||
(index :scs (signed-reg immediate) :target tmp)
|
||||
(value :scs (sse-reg)))
|
||||
(:arg-types * signed-num (:constant fixnum) (:constant fixnum) sse-pack)
|
||||
(:temporary (:sc signed-reg :from (:argument 1)) tmp)
|
||||
(:info scale offset))
|
||||
|
||||
(define-vop (sse-store-ix-op/tag sse-store-base-op)
|
||||
(:args (sap :scs (descriptor-reg) :to :eval)
|
||||
(index :scs (any-reg signed-reg immediate) :target tmp)
|
||||
(value :scs (sse-reg)))
|
||||
(:arg-types * tagged-num (:constant tagged-load-scale) (:constant fixnum) sse-pack)
|
||||
(:temporary (:sc any-reg :from (:argument 1)) tmp)
|
||||
(:info scale offset))
|
||||
|
||||
(defmacro def-store-intrinsic (&whole whole name rtype insn c-name &key setf-name)
|
||||
(declare (ignore rtype c-name))
|
||||
(let* ((vop (symbolicate "%" name))
|
||||
(c-vop (symbolicate vop "-C"))
|
||||
(ix-vop (symbolicate vop "/IX")))
|
||||
`(progn
|
||||
,(unless setf-name `(export ',name))
|
||||
(save-intrinsic-spec ,name ,whole)
|
||||
(defknown ,vop (system-area-pointer fixnum sse-pack) (values) (unsafe always-translatable))
|
||||
(define-vop (,vop sse-store-op)
|
||||
(:translate ,vop)
|
||||
(:generator 5
|
||||
(inst ,insn (make-ea :qword :base sap :index offset) value)))
|
||||
(define-vop (,c-vop sse-store-imm-op)
|
||||
(:translate ,vop)
|
||||
(:generator 4
|
||||
(inst ,insn (make-ea :qword :base sap :disp offset) value)))
|
||||
(def-splice-transform ,vop ((sap+ sap offset1) offset2 new-value)
|
||||
(,vop sap (+ offset1 offset2) new-value))
|
||||
;; Vector indexing version
|
||||
(defknown ,ix-vop (simple-array fixnum fixnum fixnum sse-pack) (values)
|
||||
(unsafe always-translatable))
|
||||
(define-vop (,ix-vop sse-store-ix-op)
|
||||
(:translate ,ix-vop)
|
||||
(:generator 4
|
||||
(inst ,insn (make-scaled-ea :qword sap index scale offset tmp) value)))
|
||||
(define-vop (,(symbolicate ix-vop "/TAG") sse-store-ix-op/tag)
|
||||
(:translate ,ix-vop)
|
||||
(:generator 3
|
||||
(inst ,insn (make-scaled-ea :qword sap index scale offset tmp :fixnum-index t) value)))
|
||||
(deftransform ,ix-vop ((thing index scale offset value))
|
||||
"fold semi-constant index expressions"
|
||||
(fold-index-addressing ',ix-vop index scale :setter-p t)))))
|
||||
|
||||
454
contrib/cl-simd/sbcl-functions.lisp
Normal file
454
contrib/cl-simd/sbcl-functions.lisp
Normal file
|
|
@ -0,0 +1,454 @@
|
|||
;;; -*- mode: Lisp; indent-tabs-mode: nil; -*-
|
||||
;;;
|
||||
;;; Copyright (c) 2010, Alexander Gavrilov (angavrilov@gmail.com)
|
||||
;;;
|
||||
;;; This file implements VOP-wrapping functions and non-primitive
|
||||
;;; extensions to the core intrinsic set.
|
||||
;;;
|
||||
|
||||
(in-package #:SSE)
|
||||
|
||||
;;; Materialize the intrinsic functions.
|
||||
|
||||
;; Since VOPs are activated only on load, actual functions that
|
||||
;; wrap them have to be defined in a different file. This is a
|
||||
;; hack to generate the functions from the same macro invocations
|
||||
;; as the VOPS.
|
||||
|
||||
(macrolet ((def-float-set-intrinsic (pubname fname atype aregtype rtype move)
|
||||
(declare (ignore aregtype move))
|
||||
`(progn
|
||||
(defun ,fname (arg)
|
||||
(declare (type ,atype arg))
|
||||
(truly-the ,rtype (%primitive ,fname arg)))
|
||||
;; Public function - includes coercion
|
||||
(export ',pubname)
|
||||
(declaim (ftype (function (real) ,rtype) ,pubname)
|
||||
(inline ,pubname))
|
||||
(defun ,pubname (arg) (,fname (coerce arg ',atype)))))
|
||||
(def-unary-intrinsic (name rtype insn cost c-name &key immediate-arg &allow-other-keys)
|
||||
(declare (ignore insn cost c-name))
|
||||
(unless immediate-arg
|
||||
`(defun ,name (x)
|
||||
(declare (type sse-pack x))
|
||||
(truly-the ,rtype (%primitive ,name x)))))
|
||||
(def-binary-intrinsic (name rtype insn cost c-name &key immediate-arg &allow-other-keys)
|
||||
(declare (ignore insn cost c-name))
|
||||
(unless immediate-arg
|
||||
`(defun ,name (x y ,@(if immediate-arg '(imm)))
|
||||
(declare (type sse-pack x y))
|
||||
(truly-the ,rtype (%primitive ,name x y)))))
|
||||
(def-sse-int-intrinsic (name itype rtype insn cost c-name &key immediate-arg &allow-other-keys)
|
||||
(declare (ignore insn cost c-name))
|
||||
(unless immediate-arg
|
||||
`(defun ,name (x iv)
|
||||
(declare (type sse-pack x)
|
||||
(type ,itype iv))
|
||||
(truly-the ,rtype (%primitive ,name x iv)))))
|
||||
(def-load-intrinsic (name rtype insn c-name &key register-arg &allow-other-keys)
|
||||
(declare (ignore insn c-name))
|
||||
(let* ((vop (symbolicate "%" name))
|
||||
(valarg (if register-arg '(value))))
|
||||
`(progn
|
||||
(declaim (inline ,name))
|
||||
(defun ,name (,@valarg pointer &optional (offset 0))
|
||||
(declare ,@(if register-arg '((type sse-pack value)))
|
||||
(type system-area-pointer pointer)
|
||||
(type fixnum offset))
|
||||
,(if rtype
|
||||
`(truly-the ,rtype (,vop ,@valarg pointer offset))
|
||||
`(,vop ,@valarg pointer offset))))))
|
||||
(def-store-intrinsic (name rtype insn c-name &key setf-name &allow-other-keys)
|
||||
(declare (ignore insn c-name))
|
||||
(let* ((vop (symbolicate "%" name)))
|
||||
`(progn
|
||||
(declaim (inline ,name))
|
||||
(defun ,name (pointer value &optional (offset 0))
|
||||
(declare (type system-area-pointer pointer)
|
||||
(type sse-pack value)
|
||||
(type fixnum offset))
|
||||
(,vop pointer offset value)
|
||||
(truly-the ,rtype value))
|
||||
,(if setf-name
|
||||
`(defsetf ,setf-name (pointer &optional (offset 0)) (value)
|
||||
`(,',name ,pointer ,value ,offset)))))))
|
||||
;; Load the definition list
|
||||
#.(loop for name being each present-symbol
|
||||
when (get name 'intrinsic-spec)
|
||||
collect it into specs
|
||||
finally (return `(progn ,@specs))))
|
||||
|
||||
;;; Helper functions and macros
|
||||
|
||||
(defmacro def-utility (name args rtype &body code)
|
||||
`(progn
|
||||
(export ',name)
|
||||
(declaim (ftype (function ,(mapcar (constantly 'sse-pack) args) ,rtype) ,name)
|
||||
(inline ,name))
|
||||
(defun ,name ,args ,@code)))
|
||||
|
||||
(defmacro def-if-function (name rtype postfix)
|
||||
(let* ((not-x (symbolicate "NOT-" postfix))
|
||||
(or-x (symbolicate "OR-" postfix))
|
||||
(and-x (symbolicate "AND-" postfix))
|
||||
(andn-x (symbolicate "ANDNOT-" postfix))
|
||||
(xor-x (symbolicate "XOR-" postfix))
|
||||
(true (%make-sse-pack #xFFFFFFFFFFFFFFFF #xFFFFFFFFFFFFFFFF))
|
||||
(false (%make-sse-pack 0 0)))
|
||||
`(progn
|
||||
(export ',name)
|
||||
(defknown ,name (sse-pack sse-pack sse-pack) ,rtype (foldable flushable))
|
||||
(defun ,name (condition true-val false-val)
|
||||
(,or-x (,and-x condition true-val)
|
||||
(,andn-x condition false-val)))
|
||||
;; Instead of inlining, use a transform so that the splice
|
||||
;; rule has a chance to apply. This depends on transform
|
||||
;; definitions behaving like a LIFO:
|
||||
(deftransform ,name ((condition true-val false-val) * *)
|
||||
"Expand the conditional."
|
||||
'(,or-x (,and-x condition true-val) (,andn-x condition false-val)))
|
||||
(def-splice-transform ,name ((,not-x cond) tv fv) (,name cond fv tv))
|
||||
;; NOT elimination and partial constant folding for bitwise ops:
|
||||
(def-splice-transform ,not-x ((,not-x arg1)) arg1)
|
||||
(def-splice-transform ,and-x (arg1 (,not-x arg2)) (,andn-x arg2 arg1))
|
||||
(def-splice-transform ,and-x ((,not-x arg1) arg2) (,andn-x arg1 arg2))
|
||||
(def-splice-transform ,andn-x ((,not-x arg1) arg2) (,and-x arg1 arg2))
|
||||
(%deftransform ',or-x '(function * *) #'commutative-arg-swap "place constant arg last")
|
||||
(%deftransform ',and-x '(function * *) #'commutative-arg-swap "place constant arg last")
|
||||
(%deftransform ',xor-x '(function * *) #'commutative-arg-swap "place constant arg last")
|
||||
(deftransform ,or-x ((arg1 arg2) (* (constant-arg (member ,true))) *) ,true)
|
||||
(deftransform ,or-x ((arg1 arg2) (* (constant-arg (member ,false))) *) 'arg1)
|
||||
(deftransform ,and-x ((arg1 arg2) (* (constant-arg (member ,true))) *) 'arg1)
|
||||
(deftransform ,and-x ((arg1 arg2) (* (constant-arg (member ,false))) *) ,false)
|
||||
(deftransform ,xor-x ((arg1 arg2) (* (constant-arg (member ,false))) *) 'arg1)
|
||||
(deftransform ,andn-x ((arg1 arg2) (* (constant-arg (member ,true))) *) 'arg1)
|
||||
(deftransform ,andn-x ((arg1 arg2) (* (constant-arg (member ,false))) *) ,false)
|
||||
(deftransform ,andn-x ((arg1 arg2) ((constant-arg (member ,true)) *) *) ,false)
|
||||
(deftransform ,andn-x ((arg1 arg2) ((constant-arg (member ,false)) *) *) 'arg2))))
|
||||
|
||||
(defmacro def-not-cmp-pairs (not-fun &rest pairs)
|
||||
`(progn
|
||||
,@(loop for (a b) on pairs by #'cddr
|
||||
collect `(def-splice-transform ,not-fun ((,a arg1 arg2)) (,b arg1 arg2))
|
||||
collect `(def-splice-transform ,not-fun ((,b arg1 arg2)) (,a arg1 arg2)))))
|
||||
|
||||
;;; CPU control
|
||||
|
||||
(defun cpu-mxcsr ()
|
||||
(cpu-mxcsr))
|
||||
|
||||
(defun %set-cpu-mxcsr (x)
|
||||
(declare (type (unsigned-byte 32) x))
|
||||
(%set-cpu-mxcsr x))
|
||||
|
||||
(defsetf cpu-mxcsr %set-cpu-mxcsr)
|
||||
|
||||
(defun cpu-load-fence () (cpu-load-fence))
|
||||
(defun cpu-store-fence () (cpu-store-fence))
|
||||
(defun cpu-memory-fence () (cpu-memory-fence))
|
||||
|
||||
(defun cpu-pause () (cpu-pause))
|
||||
|
||||
;;; Single-float
|
||||
|
||||
;; Constants
|
||||
|
||||
(define-symbol-macro 0.0-ps (truly-the float-sse-pack #.(%make-sse-pack 0 0)))
|
||||
|
||||
(define-symbol-macro true-ss (truly-the float-sse-pack #.(%make-sse-pack #xFFFFFFFF 0)))
|
||||
(define-symbol-macro true-ps (truly-the float-sse-pack #.(%make-sse-pack #xFFFFFFFFFFFFFFFF #xFFFFFFFFFFFFFFFF)))
|
||||
|
||||
(define-symbol-macro false-ss (truly-the float-sse-pack #.(%make-sse-pack 0 0)))
|
||||
(define-symbol-macro false-ps (truly-the float-sse-pack #.(%make-sse-pack 0 0)))
|
||||
|
||||
;; Initialization
|
||||
|
||||
(declaim (inline set1-ps set-ps setr-ps setzero-ps))
|
||||
|
||||
(defun set1-ps (val)
|
||||
(let ((valv (set-ss val)))
|
||||
(shuffle-ps valv valv 0)))
|
||||
|
||||
(defun set-ps (x3 x2 x1 x0)
|
||||
(movelh-ps (unpacklo-ps (set-ss x0) (set-ss x1))
|
||||
(unpacklo-ps (set-ss x2) (set-ss x3))))
|
||||
|
||||
(defun setr-ps (x0 x1 x2 x3)
|
||||
(movelh-ps (unpacklo-ps (set-ss x0) (set-ss x1))
|
||||
(unpacklo-ps (set-ss x2) (set-ss x3))))
|
||||
|
||||
(defun setzero-ps () 0.0-ps)
|
||||
|
||||
;; Arithmetic negation
|
||||
|
||||
(def-utility neg-ss (arg) float-sse-pack
|
||||
(xor-ps arg #.(%make-sse-pack #x80000000 0)))
|
||||
|
||||
(def-utility neg-ps (arg) float-sse-pack
|
||||
(xor-ps arg #.(%make-sse-pack #x8000000080000000 #x8000000080000000)))
|
||||
|
||||
;; Bitwise operations
|
||||
|
||||
(def-if-function if-ps float-sse-pack #:ps)
|
||||
|
||||
;; Comparisons
|
||||
|
||||
(def-utility >-ss (x y) float-sse-pack (<-ss y x))
|
||||
(def-utility >-ps (x y) float-sse-pack (<-ps y x))
|
||||
(def-utility >=-ss (x y) float-sse-pack (<=-ss y x))
|
||||
(def-utility >=-ps (x y) float-sse-pack (<=-ps y x))
|
||||
(def-utility />-ss (x y) float-sse-pack (/<-ss y x))
|
||||
(def-utility />-ps (x y) float-sse-pack (/<-ps y x))
|
||||
(def-utility />=-ss (x y) float-sse-pack (/<=-ss y x))
|
||||
(def-utility />=-ps (x y) float-sse-pack (/<=-ps y x))
|
||||
|
||||
(def-not-cmp-pairs not-ps
|
||||
=-ps /=-ps <-ps /<-ps <=-ps /<=-ps >-ps />-ps >=-ps />=-ps cmpord-ps cmpunord-ps)
|
||||
|
||||
;; Shuffle
|
||||
|
||||
(declaim (inline %sse-pack-to-int %int-to-sse-pack %shuffle-subints))
|
||||
|
||||
(defun %sse-pack-to-int (pack)
|
||||
(logior (%sse-pack-low pack) (ash (%sse-pack-high pack) 64)))
|
||||
|
||||
(defun %int-to-sse-pack (val &aux (mask #xFFFFFFFFFFFFFFFF))
|
||||
(%make-sse-pack (logand val mask) (logand (ash val -64) mask)))
|
||||
|
||||
(defun %shuffle-subints (xval yval imm bit-cnt &aux (mask (1- (ash 1 bit-cnt))))
|
||||
(flet ((bits (idx)
|
||||
(logand 3 (ash imm (* -2 idx))))
|
||||
(val (src idx)
|
||||
(logand mask (ash src (* (- bit-cnt) idx)))))
|
||||
(logior (val xval (bits 0))
|
||||
(ash (val xval (bits 1)) bit-cnt)
|
||||
(ash (val yval (bits 2)) (* 2 bit-cnt))
|
||||
(ash (val yval (bits 3)) (* 3 bit-cnt)))))
|
||||
|
||||
(defun shuffle-ps (x y imm)
|
||||
(declare (type sse-pack x y))
|
||||
(let* ((xval (%sse-pack-to-int x))
|
||||
(yval (%sse-pack-to-int y)))
|
||||
(truly-the float-sse-pack (%int-to-sse-pack (%shuffle-subints xval yval imm 32)))))
|
||||
|
||||
;;; Double-float
|
||||
|
||||
;; Constants
|
||||
|
||||
(define-symbol-macro 0.0-pd (truly-the double-sse-pack #.(%make-sse-pack 0 0)))
|
||||
|
||||
(define-symbol-macro true-sd (truly-the double-sse-pack #.(%make-sse-pack #xFFFFFFFFFFFFFFFF 0)))
|
||||
(define-symbol-macro true-pd (truly-the double-sse-pack #.(%make-sse-pack #xFFFFFFFFFFFFFFFF #xFFFFFFFFFFFFFFFF)))
|
||||
|
||||
(define-symbol-macro false-sd (truly-the double-sse-pack #.(%make-sse-pack 0 0)))
|
||||
(define-symbol-macro false-pd (truly-the double-sse-pack #.(%make-sse-pack 0 0)))
|
||||
|
||||
;; Initialization
|
||||
|
||||
(declaim (inline set1-pd set-pd setr-pd setzero-pd))
|
||||
|
||||
(defun set1-pd (val)
|
||||
(let ((valv (set-sd val)))
|
||||
(shuffle-pd valv valv 0)))
|
||||
|
||||
(defun set-pd (x1 x0)
|
||||
(unpacklo-pd (set-sd x0) (set-sd x1)))
|
||||
|
||||
(defun setr-pd (x0 x1)
|
||||
(unpacklo-pd (set-sd x0) (set-sd x1)))
|
||||
|
||||
(defun setzero-pd () 0.0-pd)
|
||||
|
||||
;; Arithmetic negation
|
||||
|
||||
(def-utility neg-sd (arg) double-sse-pack
|
||||
(xor-pd arg #.(%make-sse-pack #x8000000000000000 0)))
|
||||
|
||||
(def-utility neg-pd (arg) double-sse-pack
|
||||
(xor-pd arg #.(%make-sse-pack #x8000000000000000 #x8000000000000000)))
|
||||
|
||||
;; Bitwise operations
|
||||
|
||||
(def-if-function if-pd double-sse-pack #:pd)
|
||||
|
||||
;; Comparisons
|
||||
|
||||
(def-utility >-sd (x y) double-sse-pack (<-sd y x))
|
||||
(def-utility >-pd (x y) double-sse-pack (<-pd y x))
|
||||
(def-utility >=-sd (x y) double-sse-pack (<=-sd y x))
|
||||
(def-utility >=-pd (x y) double-sse-pack (<=-pd y x))
|
||||
(def-utility />-sd (x y) double-sse-pack (/<-sd y x))
|
||||
(def-utility />-pd (x y) double-sse-pack (/<-pd y x))
|
||||
(def-utility />=-sd (x y) double-sse-pack (/<=-sd y x))
|
||||
(def-utility />=-pd (x y) double-sse-pack (/<=-pd y x))
|
||||
|
||||
(def-not-cmp-pairs not-pd
|
||||
=-pd /=-pd <-pd /<-pd <=-pd /<=-pd >-pd />-pd >=-pd />=-pd cmpord-pd cmpunord-pd)
|
||||
|
||||
;; Shuffle
|
||||
|
||||
(defun shuffle-pd (x y imm)
|
||||
(declare (type sse-pack x y))
|
||||
(truly-the double-sse-pack
|
||||
(%make-sse-pack (if (logtest imm 1) (%sse-pack-high x) (%sse-pack-low x))
|
||||
(if (logtest imm 2) (%sse-pack-high y) (%sse-pack-low y)))))
|
||||
|
||||
;;; Integer
|
||||
|
||||
;; Constants
|
||||
|
||||
(define-symbol-macro 0-pi (truly-the int-sse-pack #.(%make-sse-pack 0 0)))
|
||||
|
||||
(define-symbol-macro true-pi (truly-the int-sse-pack #.(%make-sse-pack #xFFFFFFFFFFFFFFFF #xFFFFFFFFFFFFFFFF)))
|
||||
|
||||
(define-symbol-macro false-pi (truly-the int-sse-pack #.(%make-sse-pack 0 0)))
|
||||
|
||||
;; Initialization
|
||||
|
||||
(macrolet ((defset (name type)
|
||||
`(defun ,name (x)
|
||||
(declare (type ,type x))
|
||||
(,name x))))
|
||||
(defset %set-int (signed-byte 64))
|
||||
(defset %set-uint (unsigned-byte 64))
|
||||
(defset convert-si32-to-pi (signed-byte 32))
|
||||
(defset convert-su32-to-pi (unsigned-byte 32))
|
||||
(defset convert-si64-to-pi (signed-byte 64))
|
||||
(defset convert-su64-to-pi (unsigned-byte 64)))
|
||||
|
||||
(macrolet ((defset1 (name setter type shuffle &rest expands)
|
||||
`(progn
|
||||
(export ',name)
|
||||
(declaim (inline ,name))
|
||||
(defun ,name (arg)
|
||||
(let ((val (,setter (the ,type arg))))
|
||||
(declare (type int-sse-pack val))
|
||||
,@(loop for x in expands collect `(setq val (,x val val)))
|
||||
(shuffle-pi32 val ,shuffle))))))
|
||||
(defset1 set1-pi8 %set-int fixnum #4r0000 unpacklo-pi8 unpacklo-pi16)
|
||||
(defset1 set1-pi16 %set-int fixnum #4r0000 unpacklo-pi16)
|
||||
(defset1 set1-pi32 %set-int (signed-byte 32) #4r0000)
|
||||
(defset1 set1-pu32 %set-uint (unsigned-byte 32) #4r0000)
|
||||
(defset1 set1-pi64 %set-int (signed-byte 64) #4r1010)
|
||||
(defset1 set1-pu64 %set-uint (unsigned-byte 64) #4r1010))
|
||||
|
||||
(macrolet ((defset (name rname setter type depth)
|
||||
(let* ((names (loop for i from 0 below (ash 1 depth)
|
||||
collect (symbolicate (format nil "X~A" i))))
|
||||
(funcs #(unpacklo-pi64 unpacklo-pi32 unpacklo-pi16 unpacklo-pi8))
|
||||
(body (loop for i downfrom depth to 0
|
||||
for bv = (mapcar (lambda (x) `(,setter (the ,type ,x))) names)
|
||||
then (loop for (a b) on bv by #'cddr
|
||||
collect `(,(svref funcs i) ,a ,b))
|
||||
finally (return (first bv)))))
|
||||
`(progn
|
||||
(export ',name)
|
||||
(export ',rname)
|
||||
(declaim (inline ,name ,rname))
|
||||
(defun ,name (,@(reverse names)) ,body)
|
||||
(defun ,rname (,@names) ,body)))))
|
||||
(defset set-pi8 setr-pi8 %set-int fixnum 4)
|
||||
(defset set-pi16 setr-pi16 %set-int fixnum 3)
|
||||
(defset set-pi32 setr-pi32 %set-int (signed-byte 32) 2)
|
||||
(defset set-pu32 setr-pu32 %set-uint (unsigned-byte 32) 2)
|
||||
(defset set-pi64 setr-pi64 %set-int (signed-byte 64) 1)
|
||||
(defset set-pu64 setr-pu64 %set-uint (unsigned-byte 64) 1))
|
||||
|
||||
(declaim (inline setzero-pi))
|
||||
(defun setzero-pi () 0-pi)
|
||||
|
||||
;; Masked move
|
||||
|
||||
(export 'maskmoveu-pi)
|
||||
|
||||
(declaim (inline maskmoveu-pi))
|
||||
|
||||
(defun maskmoveu-pi (value mask pointer &optional (offset 0))
|
||||
(declare (type sse-pack value mask)
|
||||
(type system-area-pointer pointer)
|
||||
(type fixnum offset))
|
||||
(%maskmoveu-pi value mask pointer offset))
|
||||
|
||||
;; Arithmetic negation (subtract from 0)
|
||||
|
||||
(macrolet ((frob (name subf)
|
||||
`(def-utility ,name (arg) int-sse-pack (,subf 0-pi arg))))
|
||||
(frob neg-pi8 sub-pi8)
|
||||
(frob neg-pi16 sub-pi16)
|
||||
(frob neg-pi32 sub-pi32)
|
||||
(frob neg-pi64 sub-pi64))
|
||||
|
||||
;; Bitwise operations
|
||||
|
||||
(def-if-function if-pi int-sse-pack #:pi)
|
||||
|
||||
;; Comparisons
|
||||
|
||||
(def-utility <-pi8 (x y) int-sse-pack (>-pi8 y x))
|
||||
(def-utility <-pi16 (x y) int-sse-pack (>-pi16 y x))
|
||||
(def-utility <-pi32 (x y) int-sse-pack (>-pi32 y x))
|
||||
|
||||
(def-utility <=-pi8 (x y) int-sse-pack (not-pi (>-pi8 x y)))
|
||||
(def-utility <=-pi16 (x y) int-sse-pack (not-pi (>-pi16 x y)))
|
||||
(def-utility <=-pi32 (x y) int-sse-pack (not-pi (>-pi32 x y)))
|
||||
|
||||
(def-utility >=-pi8 (x y) int-sse-pack (not-pi (>-pi8 y x)))
|
||||
(def-utility >=-pi16 (x y) int-sse-pack (not-pi (>-pi16 y x)))
|
||||
(def-utility >=-pi32 (x y) int-sse-pack (not-pi (>-pi32 y x)))
|
||||
|
||||
(def-utility /=-pi8 (x y) int-sse-pack (not-pi (=-pi8 x y)))
|
||||
(def-utility /=-pi16 (x y) int-sse-pack (not-pi (=-pi16 x y)))
|
||||
(def-utility /=-pi32 (x y) int-sse-pack (not-pi (=-pi32 x y)))
|
||||
|
||||
;; Shifts
|
||||
|
||||
(defun slli-pi (x imm)
|
||||
(declare (type sse-pack x))
|
||||
(truly-the int-sse-pack
|
||||
(if (> imm 15)
|
||||
0-pi
|
||||
(%int-to-sse-pack (ash (%sse-pack-to-int x) (* 8 imm))))))
|
||||
|
||||
(defun srli-pi (x imm)
|
||||
(declare (type sse-pack x))
|
||||
(truly-the int-sse-pack
|
||||
(if (> imm 15)
|
||||
0-pi
|
||||
(%int-to-sse-pack (ash (%sse-pack-to-int x) (* -8 imm))))))
|
||||
|
||||
;; Extract & insert
|
||||
|
||||
(defun extract-pi16 (x imm)
|
||||
(declare (type sse-pack x))
|
||||
(logand #xFFFF
|
||||
(ash (%sse-pack-to-int x)
|
||||
(- (* 16 (logand imm 7))))))
|
||||
|
||||
(defun insert-pi16 (x intv imm)
|
||||
(declare (type sse-pack x))
|
||||
(let ((shift (* 16 (logand imm 7))))
|
||||
(truly-the int-sse-pack
|
||||
(%int-to-sse-pack
|
||||
(logior (logand (%sse-pack-to-int x)
|
||||
(lognot (ash #xFFFF shift)))
|
||||
(ash (logand intv #xFFFF) shift))))))
|
||||
|
||||
;; Shuffle
|
||||
|
||||
(defun shuffle-pi32 (x imm)
|
||||
(declare (type sse-pack x))
|
||||
(let* ((xval (%sse-pack-to-int x)))
|
||||
(truly-the int-sse-pack (%int-to-sse-pack (%shuffle-subints xval xval imm 32)))))
|
||||
|
||||
(defun shufflelo-pi16 (x imm)
|
||||
(declare (type sse-pack x))
|
||||
(let* ((xval (%sse-pack-low x)))
|
||||
(truly-the int-sse-pack (%make-sse-pack (%shuffle-subints xval xval imm 16)
|
||||
(%sse-pack-high x)))))
|
||||
|
||||
(defun shufflehi-pi16 (x imm)
|
||||
(declare (type sse-pack x))
|
||||
(let* ((xval (%sse-pack-high x)))
|
||||
(truly-the int-sse-pack (%make-sse-pack (%sse-pack-low x)
|
||||
(%shuffle-subints xval xval imm 16)))))
|
||||
|
||||
61
contrib/cl-simd/sse-array-defs.lisp
Normal file
61
contrib/cl-simd/sse-array-defs.lisp
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
;;; -*- mode: Lisp; indent-tabs-mode: nil; -*-
|
||||
;;;
|
||||
;;; Copyright (c) 2010, Alexander Gavrilov (angavrilov@gmail.com)
|
||||
;;;
|
||||
;;; This file contains definitions for vectorized access
|
||||
;;; to specialized lisp arrays.
|
||||
;;;
|
||||
|
||||
(in-package #:SSE)
|
||||
|
||||
;;; Prefetch: AREF-PREFETCH-*, ROW-MAJOR-AREF-PREFETCH-*
|
||||
|
||||
(def-aref-intrinsic #:PREFETCH-T0 nil cpu-prefetch-t0 nil :check-bounds nil)
|
||||
(def-aref-intrinsic #:PREFETCH-T1 nil cpu-prefetch-t1 nil :check-bounds nil)
|
||||
(def-aref-intrinsic #:PREFETCH-T2 nil cpu-prefetch-t2 nil :check-bounds nil)
|
||||
(def-aref-intrinsic #:PREFETCH-NTA nil cpu-prefetch-nta nil :check-bounds nil)
|
||||
|
||||
(def-aref-intrinsic #:CLFLUSH nil cpu-clflush nil :check-bounds :no-gap)
|
||||
|
||||
;;; Single-float
|
||||
|
||||
;; AREF-PS, ROW-MAJOR-AREF-PS
|
||||
|
||||
(def-aref-intrinsic #:PS float-sse-pack mem-ref-ps mem-set-ps)
|
||||
|
||||
;; AREF-APS, ROW-MAJOR-AREF-APS (requires alignment)
|
||||
|
||||
(def-aref-intrinsic #:APS float-sse-pack mem-ref-aps mem-set-aps)
|
||||
|
||||
;; AREF-SPS, ROW-MAJOR-AREF-SPS (requires alignment; no write cache)
|
||||
|
||||
(def-aref-intrinsic #:SPS float-sse-pack mem-ref-aps stream-ps)
|
||||
|
||||
;;; Double-float
|
||||
|
||||
;; AREF-PD, ROW-MAJOR-AREF-PD
|
||||
|
||||
(def-aref-intrinsic #:PD double-sse-pack mem-ref-pd mem-set-pd)
|
||||
|
||||
;; AREF-APD, ROW-MAJOR-AREF-APD (requires alignment)
|
||||
|
||||
(def-aref-intrinsic #:APD double-sse-pack mem-ref-apd mem-set-apd)
|
||||
|
||||
;; AREF-SPD, ROW-MAJOR-AREF-SPD (requires alignment; no write cache)
|
||||
|
||||
(def-aref-intrinsic #:SPD double-sse-pack mem-ref-apd stream-pd)
|
||||
|
||||
;;; Integer
|
||||
|
||||
;; AREF-PI, ROW-MAJOR-AREF-PI
|
||||
|
||||
(def-aref-intrinsic #:PI int-sse-pack mem-ref-pi mem-set-pi)
|
||||
|
||||
;; AREF-API, ROW-MAJOR-AREF-API (requires alignment)
|
||||
|
||||
(def-aref-intrinsic #:API int-sse-pack mem-ref-api mem-set-api)
|
||||
|
||||
;; AREF-SPI, ROW-MAJOR-AREF-SPI (requires alignment; no write cache)
|
||||
|
||||
(def-aref-intrinsic #:SPI int-sse-pack mem-ref-api stream-pi)
|
||||
|
||||
689
contrib/cl-simd/sse-intrinsics.lisp
Normal file
689
contrib/cl-simd/sse-intrinsics.lisp
Normal file
|
|
@ -0,0 +1,689 @@
|
|||
;;; -*- mode: Lisp; indent-tabs-mode: nil; -*-
|
||||
;;;
|
||||
;;; Copyright (c) 2010, Alexander Gavrilov (angavrilov@gmail.com)
|
||||
;;;
|
||||
;;; This file contains definitions for all SSE intrinsics.
|
||||
;;;
|
||||
;;; The macros are defined in the *-core.lisp files.
|
||||
;;; On SBCL wrapping functions are defined by sbcl-functions.lisp.
|
||||
;;;
|
||||
|
||||
(in-package #:SSE)
|
||||
|
||||
#+(and ecl (or ecl_min stage1 cross))
|
||||
(eval-when (:compile-toplevel)
|
||||
;; During the initial bootstrap sequence when the contribs are
|
||||
;; compiled, the system does not load fasls after building them.
|
||||
;; (For all it knows, it might be cross-compiling to another architecture.)
|
||||
;; Work around by loading the macro definition file into the interpreter:
|
||||
(load (merge-pathnames #P"ecl-sse-core.lisp" *compile-file-truename*)))
|
||||
|
||||
;;; Prefetch
|
||||
|
||||
(def-load-intrinsic cpu-prefetch-t0 nil prefetch "_mm_prefetch" :tags (:t0) :size :byte :postfix-fmt ",_MM_HINT_T0")
|
||||
(def-load-intrinsic cpu-prefetch-t1 nil prefetch "_mm_prefetch" :tags (:t1) :size :byte :postfix-fmt ",_MM_HINT_T1")
|
||||
(def-load-intrinsic cpu-prefetch-t2 nil prefetch "_mm_prefetch" :tags (:t2) :size :byte :postfix-fmt ",_MM_HINT_T2")
|
||||
(def-load-intrinsic cpu-prefetch-nta nil prefetch "_mm_prefetch" :tags (:nta) :size :byte :postfix-fmt ",_MM_HINT_NTA")
|
||||
|
||||
(def-load-intrinsic cpu-clflush nil clflush "_mm_clflush" :size :byte)
|
||||
|
||||
;;; CPU control
|
||||
|
||||
#+sbcl
|
||||
(progn
|
||||
(defknown cpu-mxcsr () (unsigned-byte 32) (flushable))
|
||||
|
||||
(define-vop (cpu-mxcsr)
|
||||
(:translate cpu-mxcsr)
|
||||
(:args) (:arg-types)
|
||||
(:results (result :scs (unsigned-reg)))
|
||||
(:result-types unsigned-num)
|
||||
(:temporary (:sc unsigned-stack) tmp)
|
||||
(:policy :fast-safe)
|
||||
(:generator 3
|
||||
(let ((ea (make-ea :dword :base rbp-tn
|
||||
:disp (frame-byte-offset (tn-offset tmp)))))
|
||||
(inst stmxcsr ea)
|
||||
(inst mov (reg-in-size result :dword) ea))))
|
||||
|
||||
(defknown %set-cpu-mxcsr ((unsigned-byte 32)) (unsigned-byte 32) (unsafe))
|
||||
|
||||
(define-vop (%set-cpu-mxcsr)
|
||||
(:translate %set-cpu-mxcsr)
|
||||
(:args (value :scs (unsigned-reg unsigned-stack) :target result))
|
||||
(:arg-types unsigned-num)
|
||||
(:results (result :scs (unsigned-reg)
|
||||
:load-if (not (and (sc-is result unsigned-stack)
|
||||
(or (sc-is value unsigned-reg)
|
||||
(location= value result))))))
|
||||
(:result-types unsigned-num)
|
||||
(:temporary (:sc unsigned-stack) tmp)
|
||||
(:policy :fast-safe)
|
||||
(:generator 3
|
||||
(cond ((sc-is value unsigned-stack)
|
||||
(setf tmp value))
|
||||
((sc-is result unsigned-stack)
|
||||
(setf tmp result)))
|
||||
(move tmp value)
|
||||
(unless (location= result tmp)
|
||||
(move result value))
|
||||
(let ((ea (make-ea :dword :base rbp-tn
|
||||
:disp (frame-byte-offset (tn-offset tmp)))))
|
||||
(inst ldmxcsr ea))))
|
||||
|
||||
(macrolet ((defvoid (name insn)
|
||||
`(progn
|
||||
(export ',name)
|
||||
(defknown ,name () (values) ())
|
||||
(define-vop (,name)
|
||||
(:translate ,name)
|
||||
(:policy :fast-safe)
|
||||
(:generator 1
|
||||
(inst ,insn))))))
|
||||
(defvoid cpu-load-fence lfence)
|
||||
(defvoid cpu-store-fence sfence)
|
||||
(defvoid cpu-memory-fence mfence)
|
||||
(defvoid cpu-pause pause)))
|
||||
|
||||
#+ecl
|
||||
(progn
|
||||
(def-intrinsic cpu-mxcsr () fixnum "_mm_getcsr")
|
||||
(def-intrinsic %set-cpu-mxcsr (fixnum) fixnum "_mm_setcsr" :export nil :ret-arg 0)
|
||||
|
||||
(defsetf cpu-mxcsr %set-cpu-mxcsr)
|
||||
|
||||
(def-intrinsic cpu-load-fence () nil "_mm_lfence")
|
||||
(def-intrinsic cpu-store-fence () nil "_mm_sfence")
|
||||
(def-intrinsic cpu-memory-fence () nil "_mm_mfence")
|
||||
|
||||
(def-intrinsic cpu-pause () nil "_mm_pause"))
|
||||
|
||||
;;; Single-float
|
||||
|
||||
;; Initialization
|
||||
|
||||
#+sbcl
|
||||
(def-float-set-intrinsic set-ss %set-ss single-float single-reg float-sse-pack movaps)
|
||||
|
||||
#+ecl
|
||||
(progn
|
||||
(def-intrinsic set-ss (single-float) float-sse-pack "_mm_set_ss")
|
||||
(def-intrinsic set1-ps (single-float) float-sse-pack "_mm_set1_ps")
|
||||
|
||||
(def-intrinsic set-ps (single-float single-float single-float single-float) float-sse-pack "_mm_set_ps")
|
||||
(def-intrinsic setr-ps (single-float single-float single-float single-float) float-sse-pack "_mm_setr_ps")
|
||||
|
||||
(def-intrinsic setzero-ps () float-sse-pack "_mm_setzero_ps"))
|
||||
|
||||
;; Memory
|
||||
|
||||
(def-load-intrinsic mem-ref-ss float-sse-pack movss "_mm_load_ss")
|
||||
|
||||
(def-load-intrinsic mem-ref-ps float-sse-pack movups "_mm_loadu_ps")
|
||||
(def-load-intrinsic mem-ref-aps float-sse-pack movaps "_mm_load_ps")
|
||||
|
||||
(def-store-intrinsic mem-set-ss float-sse-pack movss "_mm_store_ss" :setf-name mem-ref-ss)
|
||||
|
||||
(def-store-intrinsic mem-set-ps float-sse-pack movups "_mm_storeu_ps" :setf-name mem-ref-ps)
|
||||
(def-store-intrinsic mem-set-aps float-sse-pack movaps "_mm_store_ps" :setf-name mem-ref-aps)
|
||||
|
||||
(def-store-intrinsic stream-ps float-sse-pack movntps "_mm_stream_ps")
|
||||
|
||||
;; Arithmetics
|
||||
|
||||
(def-binary-intrinsic add-ss float-sse-pack addss 3 "_mm_add_ss")
|
||||
(def-binary-intrinsic add-ps float-sse-pack addps 3 "_mm_add_ps" :commutative t)
|
||||
(def-binary-intrinsic sub-ss float-sse-pack subss 3 "_mm_sub_ss")
|
||||
(def-binary-intrinsic sub-ps float-sse-pack subps 3 "_mm_sub_ps")
|
||||
(def-binary-intrinsic mul-ss float-sse-pack mulss 5 "_mm_mul_ss")
|
||||
(def-binary-intrinsic mul-ps float-sse-pack mulps 5 "_mm_mul_ps" :commutative t)
|
||||
(def-binary-intrinsic div-ss float-sse-pack divss 13 "_mm_div_ss")
|
||||
(def-binary-intrinsic div-ps float-sse-pack divps 13 "_mm_div_ps")
|
||||
(def-binary-intrinsic min-ss float-sse-pack minss 3 "_mm_min_ss")
|
||||
(def-binary-intrinsic min-ps float-sse-pack minps 3 "_mm_min_ps":commutative t)
|
||||
(def-binary-intrinsic max-ss float-sse-pack maxss 3 "_mm_max_ss")
|
||||
(def-binary-intrinsic max-ps float-sse-pack maxps 3 "_mm_max_ps" :commutative t)
|
||||
|
||||
(def-unary-intrinsic sqrt-ss float-sse-pack sqrtss 20 "_mm_sqrt_ss" :partial t)
|
||||
(def-unary-intrinsic sqrt-ps float-sse-pack sqrtps 20 "_mm_sqrt_ps")
|
||||
(def-unary-intrinsic rsqrt-ss float-sse-pack rsqrtss 20 "_mm_rsqrt_ss" :partial t)
|
||||
(def-unary-intrinsic rsqrt-ps float-sse-pack rsqrtps 20 "_mm_rsqrt_ps")
|
||||
(def-unary-intrinsic rcp-ss float-sse-pack rcpss 13 "_mm_rcp_ss" :partial t)
|
||||
(def-unary-intrinsic rcp-ps float-sse-pack rcpps 13 "_mm_rcp_ps")
|
||||
|
||||
;; Bitwise logic
|
||||
|
||||
#+sbcl
|
||||
(def-not-intrinsic not-ps float-sse-pack xorps)
|
||||
|
||||
(def-binary-intrinsic and-ps float-sse-pack andps 1 "_mm_and_ps" :commutative t)
|
||||
(def-binary-intrinsic andnot-ps float-sse-pack andnps 1 "_mm_andnot_ps")
|
||||
(def-binary-intrinsic or-ps float-sse-pack orps 1 "_mm_or_ps" :commutative t)
|
||||
(def-binary-intrinsic xor-ps float-sse-pack xorps 1 "_mm_xor_ps" :commutative t)
|
||||
|
||||
;; Comparisons
|
||||
|
||||
(def-binary-intrinsic =-ss float-sse-pack cmpss 3 "_mm_cmpeq_ss" :tags (:eq))
|
||||
(def-binary-intrinsic =-ps float-sse-pack cmpps 3 "_mm_cmpeq_ps" :tags (:eq) :commutative t)
|
||||
(def-binary-intrinsic <-ss float-sse-pack cmpss 3 "_mm_cmplt_ss" :tags (:lt))
|
||||
(def-binary-intrinsic <-ps float-sse-pack cmpps 3 "_mm_cmplt_ps" :tags (:lt))
|
||||
(def-binary-intrinsic <=-ss float-sse-pack cmpss 3 "_mm_cmple_ss" :tags (:le))
|
||||
(def-binary-intrinsic <=-ps float-sse-pack cmpps 3 "_mm_cmple_ps" :tags (:le))
|
||||
#+ecl
|
||||
(def-binary-intrinsic >-ss float-sse-pack nil nil "_mm_cmpgt_ss")
|
||||
#+ecl
|
||||
(def-binary-intrinsic >-ps float-sse-pack nil nil "_mm_cmpgt_ps")
|
||||
#+ecl
|
||||
(def-binary-intrinsic >=-ss float-sse-pack nil nil "_mm_cmpge_ss")
|
||||
#+ecl
|
||||
(def-binary-intrinsic >=-ps float-sse-pack nil nil "_mm_cmpge_ps")
|
||||
|
||||
(def-binary-intrinsic /=-ss float-sse-pack cmpss 3 "_mm_cmpneq_ss" :tags (:neq))
|
||||
(def-binary-intrinsic /=-ps float-sse-pack cmpps 3 "_mm_cmpneq_ps" :tags (:neq) :commutative t)
|
||||
(def-binary-intrinsic /<-ss float-sse-pack cmpss 3 "_mm_cmpnlt_ss" :tags (:nlt))
|
||||
(def-binary-intrinsic /<-ps float-sse-pack cmpps 3 "_mm_cmpnlt_ps" :tags (:nlt))
|
||||
(def-binary-intrinsic /<=-ss float-sse-pack cmpss 3 "_mm_cmpnle_ss" :tags (:nle))
|
||||
(def-binary-intrinsic /<=-ps float-sse-pack cmpps 3 "_mm_cmpnle_ps" :tags (:nle))
|
||||
#+ecl
|
||||
(def-binary-intrinsic />-ss float-sse-pack nil nil "_mm_cmpngt_ss")
|
||||
#+ecl
|
||||
(def-binary-intrinsic />-ps float-sse-pack nil nil "_mm_cmpngt_ps")
|
||||
#+ecl
|
||||
(def-binary-intrinsic />=-ss float-sse-pack nil nil "_mm_cmpnge_ss")
|
||||
#+ecl
|
||||
(def-binary-intrinsic />=-ps float-sse-pack nil nil "_mm_cmpnge_ps")
|
||||
|
||||
(def-binary-intrinsic cmpord-ss float-sse-pack cmpss 3 "_mm_cmpord_ss" :tags (:ord)) ; neither is NaN
|
||||
(def-binary-intrinsic cmpord-ps float-sse-pack cmpps 3 "_mm_cmpord_ps" :tags (:ord) :commutative t)
|
||||
(def-binary-intrinsic cmpunord-ss float-sse-pack cmpss 3 "_mm_cmpunord_ss" :tags (:unord))
|
||||
(def-binary-intrinsic cmpunord-ps float-sse-pack cmpps 3 "_mm_cmpunord_ps" :tags (:unord) :commutative t)
|
||||
|
||||
#| Skipped: _mm_u?comi.*_ss |#
|
||||
|
||||
;; Misc
|
||||
|
||||
(def-binary-intrinsic unpackhi-ps float-sse-pack unpckhps 1 "_mm_unpackhi_ps")
|
||||
(def-binary-intrinsic unpacklo-ps float-sse-pack unpcklps 1 "_mm_unpacklo_ps")
|
||||
|
||||
(def-binary-intrinsic move-ss float-sse-pack movss 1 "_mm_move_ss")
|
||||
|
||||
(def-binary-intrinsic movehl-ps float-sse-pack movhlps 1 "_mm_movehl_ps")
|
||||
(def-binary-intrinsic movelh-ps float-sse-pack movlhps 1 "_mm_movelh_ps")
|
||||
|
||||
(def-unary-intrinsic movemask-ps (unsigned-byte 4) movmskps 1 "_mm_movemask_ps" :arg-type float-sse-pack)
|
||||
|
||||
;; Shuffle
|
||||
|
||||
(def-binary-intrinsic shuffle-ps float-sse-pack shufps 1 "_mm_shuffle_ps" :immediate-arg (unsigned-byte 8))
|
||||
|
||||
;; Conversion
|
||||
|
||||
(def-unary-intrinsic convert-pi32-to-ps float-sse-pack cvtdq2ps 3 "_mm_cvtepi32_ps" :arg-type int-sse-pack)
|
||||
(def-unary-intrinsic convert-ps-to-pi32 int-sse-pack cvtps2dq 3 "_mm_cvtps_epi32" :arg-type float-sse-pack)
|
||||
(def-unary-intrinsic truncate-ps-to-pi32 int-sse-pack cvttps2dq 3 "_mm_cvttps_epi32" :arg-type float-sse-pack)
|
||||
|
||||
(def-sse-int-intrinsic convert-si32-to-ss (signed-byte 32) float-sse-pack cvtsi2ss 3 "_mm_cvtsi32_ss")
|
||||
(def-cvt-to-int32-intrinsic convert-ss-to-si32 (signed-byte 32) cvtss2si 3 "_mm_cvtss_si32" :arg-type float-sse-pack)
|
||||
(def-cvt-to-int32-intrinsic truncate-ss-to-si32 (signed-byte 32) cvttss2si 3 "_mm_cvttss_si32" :arg-type float-sse-pack)
|
||||
|
||||
#+(or x86_64 x86-64)
|
||||
(def-sse-int-intrinsic convert-si64-to-ss (signed-byte 64) float-sse-pack cvtsi2ss 3
|
||||
#-msvc "_mm_cvtsi64_ss" #+msvc "_mm_cvtsi64x_ss")
|
||||
#+(or x86_64 x86-64)
|
||||
(def-unary-intrinsic convert-ss-to-si64 (signed-byte 64) cvtss2si 3
|
||||
#-msvc "_mm_cvtss_si64" #+msvc "_mm_cvtss_si64x" :arg-type float-sse-pack)
|
||||
#+(or x86_64 x86-64)
|
||||
(def-unary-intrinsic truncate-ss-to-si64 (signed-byte 64) cvttss2si 3
|
||||
#-msvc "_mm_cvttss_si64" #+msvc "_mm_cvttss_si64x" :arg-type float-sse-pack)
|
||||
|
||||
;;; Double-float
|
||||
|
||||
;; Initialization
|
||||
|
||||
#+sbcl
|
||||
(def-float-set-intrinsic set-sd %set-sd double-float double-reg double-sse-pack movapd)
|
||||
|
||||
#+ecl
|
||||
(progn
|
||||
(def-intrinsic set-sd (double-float) double-sse-pack "_mm_set_sd")
|
||||
(def-intrinsic set1-pd (double-float) double-sse-pack "_mm_set1_pd")
|
||||
|
||||
(def-intrinsic set-pd (double-float double-float) double-sse-pack "_mm_set_pd")
|
||||
(def-intrinsic setr-pd (double-float double-float) double-sse-pack "_mm_setr_pd")
|
||||
|
||||
(def-intrinsic setzero-pd () double-sse-pack "_mm_setzero_pd"))
|
||||
|
||||
;; Memory
|
||||
|
||||
(def-load-intrinsic mem-ref-sd double-sse-pack movsd "_mm_load_sd")
|
||||
|
||||
(def-load-intrinsic mem-ref-pd double-sse-pack movupd "_mm_loadu_pd")
|
||||
(def-load-intrinsic mem-ref-apd double-sse-pack movapd "_mm_load_pd")
|
||||
|
||||
(def-load-intrinsic loadh-pd double-sse-pack movhpd "_mm_loadh_pd" :register-arg t)
|
||||
(def-load-intrinsic loadl-pd double-sse-pack movlpd "_mm_loadl_pd" :register-arg t)
|
||||
|
||||
(def-store-intrinsic mem-set-sd double-sse-pack movsd "_mm_store_sd" :setf-name mem-ref-sd)
|
||||
|
||||
(def-store-intrinsic mem-set-pd double-sse-pack movupd "_mm_storeu_pd" :setf-name mem-ref-pd)
|
||||
(def-store-intrinsic mem-set-apd double-sse-pack movapd "_mm_store_pd" :setf-name mem-ref-apd)
|
||||
|
||||
(def-store-intrinsic storeh-pd double-sse-pack movhpd "_mm_storeh_pd")
|
||||
(def-store-intrinsic storel-pd double-sse-pack movlpd "_mm_storel_pd")
|
||||
|
||||
(def-store-intrinsic stream-pd double-sse-pack movntpd "_mm_stream_pd")
|
||||
|
||||
;; Arithmetics
|
||||
|
||||
(def-binary-intrinsic add-sd double-sse-pack addsd 3 "_mm_add_sd")
|
||||
(def-binary-intrinsic add-pd double-sse-pack addpd 3 "_mm_add_pd" :commutative t)
|
||||
(def-binary-intrinsic sub-sd double-sse-pack subsd 3 "_mm_sub_sd")
|
||||
(def-binary-intrinsic sub-pd double-sse-pack subpd 3 "_mm_sub_pd")
|
||||
(def-binary-intrinsic mul-sd double-sse-pack mulsd 5 "_mm_mul_sd")
|
||||
(def-binary-intrinsic mul-pd double-sse-pack mulpd 5 "_mm_mul_pd" :commutative t)
|
||||
(def-binary-intrinsic div-sd double-sse-pack divsd 13 "_mm_div_sd")
|
||||
(def-binary-intrinsic div-pd double-sse-pack divpd 13 "_mm_div_pd")
|
||||
(def-binary-intrinsic min-sd double-sse-pack minsd 3 "_mm_min_sd")
|
||||
(def-binary-intrinsic min-pd double-sse-pack minpd 3 "_mm_min_pd" :commutative t)
|
||||
(def-binary-intrinsic max-sd double-sse-pack maxsd 3 "_mm_max_sd")
|
||||
(def-binary-intrinsic max-pd double-sse-pack maxpd 3 "_mm_max_pd" :commutative t)
|
||||
|
||||
(def-binary-intrinsic sqrt-sd double-sse-pack sqrtsd 20 "_mm_sqrt_sd")
|
||||
(def-unary-intrinsic sqrt-pd double-sse-pack sqrtpd 20 "_mm_sqrt_pd")
|
||||
|
||||
;; Bitwise logic
|
||||
|
||||
#+sbcl
|
||||
(def-not-intrinsic not-pd double-sse-pack xorpd)
|
||||
|
||||
(def-binary-intrinsic and-pd double-sse-pack andpd 1 "_mm_and_pd" :commutative t)
|
||||
(def-binary-intrinsic andnot-pd double-sse-pack andnpd 1 "_mm_andnot_pd")
|
||||
(def-binary-intrinsic or-pd double-sse-pack orpd 1 "_mm_or_pd" :commutative t)
|
||||
(def-binary-intrinsic xor-pd double-sse-pack xorpd 1 "_mm_xor_pd" :commutative t)
|
||||
|
||||
;; Comparisons
|
||||
|
||||
(def-binary-intrinsic =-sd double-sse-pack cmpsd 3 "_mm_cmpeq_sd" :tags (:eq))
|
||||
(def-binary-intrinsic =-pd double-sse-pack cmppd 3 "_mm_cmpeq_pd" :tags (:eq) :commutative t)
|
||||
(def-binary-intrinsic <-sd double-sse-pack cmpsd 3 "_mm_cmplt_sd" :tags (:lt))
|
||||
(def-binary-intrinsic <-pd double-sse-pack cmppd 3 "_mm_cmplt_pd" :tags (:lt))
|
||||
(def-binary-intrinsic <=-sd double-sse-pack cmpsd 3 "_mm_cmple_sd" :tags (:le))
|
||||
(def-binary-intrinsic <=-pd double-sse-pack cmppd 3 "_mm_cmple_pd" :tags (:le))
|
||||
#+ecl
|
||||
(def-binary-intrinsic >-sd double-sse-pack nil nil "_mm_cmpgt_sd")
|
||||
#+ecl
|
||||
(def-binary-intrinsic >-pd double-sse-pack nil nil "_mm_cmpgt_pd")
|
||||
#+ecl
|
||||
(def-binary-intrinsic >=-sd double-sse-pack nil nil "_mm_cmpge_sd")
|
||||
#+ecl
|
||||
(def-binary-intrinsic >=-pd double-sse-pack nil nil "_mm_cmpge_pd")
|
||||
|
||||
(def-binary-intrinsic /=-sd double-sse-pack cmpsd 3 "_mm_cmpneq_sd" :tags (:neq))
|
||||
(def-binary-intrinsic /=-pd double-sse-pack cmppd 3 "_mm_cmpneq_pd" :tags (:neq) :commutative t)
|
||||
(def-binary-intrinsic /<-sd double-sse-pack cmpsd 3 "_mm_cmpnlt_sd" :tags (:nlt))
|
||||
(def-binary-intrinsic /<-pd double-sse-pack cmppd 3 "_mm_cmpnlt_pd" :tags (:nlt))
|
||||
(def-binary-intrinsic /<=-sd double-sse-pack cmpsd 3 "_mm_cmpnle_sd" :tags (:nle))
|
||||
(def-binary-intrinsic /<=-pd double-sse-pack cmppd 3 "_mm_cmpnle_pd" :tags (:nle))
|
||||
#+ecl
|
||||
(def-binary-intrinsic />-sd double-sse-pack nil nil "_mm_cmpngt_sd")
|
||||
#+ecl
|
||||
(def-binary-intrinsic />-pd double-sse-pack nil nil "_mm_cmpngt_pd")
|
||||
#+ecl
|
||||
(def-binary-intrinsic />=-sd double-sse-pack nil nil "_mm_cmpnge_sd")
|
||||
#+ecl
|
||||
(def-binary-intrinsic />=-pd double-sse-pack nil nil "_mm_cmpnge_pd")
|
||||
|
||||
(def-binary-intrinsic cmpord-sd double-sse-pack cmpsd 3 "_mm_cmpord_sd" :tags (:ord)) ; neither is NaN
|
||||
(def-binary-intrinsic cmpord-pd double-sse-pack cmppd 3 "_mm_cmpord_pd" :tags (:ord) :commutative t)
|
||||
(def-binary-intrinsic cmpunord-sd double-sse-pack cmpsd 3 "_mm_cmpunord_sd" :tags (:unord))
|
||||
(def-binary-intrinsic cmpunord-pd double-sse-pack cmppd 3 "_mm_cmpunord_pd" :tags (:unord) :commutative t)
|
||||
|
||||
;; Misc
|
||||
|
||||
(def-binary-intrinsic unpackhi-pd double-sse-pack unpckhpd 1 "_mm_unpackhi_pd")
|
||||
(def-binary-intrinsic unpacklo-pd double-sse-pack unpcklpd 1 "_mm_unpacklo_pd")
|
||||
|
||||
(def-binary-intrinsic move-sd double-sse-pack movsd 1 "_mm_move_sd")
|
||||
|
||||
(def-unary-intrinsic movemask-pd (unsigned-byte 2) movmskpd 1 "_mm_movemask_pd" :arg-type double-sse-pack)
|
||||
|
||||
;; Shuffle
|
||||
|
||||
(def-binary-intrinsic shuffle-pd double-sse-pack shufpd 1 "_mm_shuffle_pd" :immediate-arg (unsigned-byte 2))
|
||||
|
||||
;; Conversion
|
||||
|
||||
(def-unary-intrinsic convert-ps-to-pd double-sse-pack cvtps2pd 3 "_mm_cvtps_pd" :arg-type float-sse-pack)
|
||||
(def-unary-intrinsic convert-pd-to-ps float-sse-pack cvtpd2ps 3 "_mm_cvtpd_ps" :arg-type double-sse-pack)
|
||||
|
||||
(def-binary-intrinsic convert-ss-to-sd double-sse-pack cvtss2sd 3 "_mm_cvtss_sd" :y-type float-sse-pack)
|
||||
(def-binary-intrinsic convert-sd-to-ss float-sse-pack cvtsd2ss 3 "_mm_cvtsd_ss" :y-type double-sse-pack)
|
||||
|
||||
(def-unary-intrinsic convert-pi32-to-pd double-sse-pack cvtdq2pd 3 "_mm_cvtepi32_pd" :arg-type int-sse-pack)
|
||||
(def-unary-intrinsic convert-pd-to-pi32 int-sse-pack cvtpd2dq 3 "_mm_cvtpd_epi32" :arg-type double-sse-pack)
|
||||
(def-unary-intrinsic truncate-pd-to-pi32 int-sse-pack cvttpd2dq 3 "_mm_cvttpd_epi32" :arg-type double-sse-pack)
|
||||
|
||||
(def-sse-int-intrinsic convert-si32-to-sd (signed-byte 32) double-sse-pack cvtsi2ss 3 "_mm_cvtsi32_sd")
|
||||
(def-cvt-to-int32-intrinsic convert-sd-to-si32 (signed-byte 32) cvtsd2si 3 "_mm_cvtsd_si32" :arg-type double-sse-pack)
|
||||
(def-cvt-to-int32-intrinsic truncate-sd-to-si32 (signed-byte 32) cvttsd2si 3 "_mm_cvttsd_si32" :arg-type double-sse-pack)
|
||||
|
||||
#+(or x86_64 x86-64)
|
||||
(def-sse-int-intrinsic convert-si64-to-sd (signed-byte 64) double-sse-pack cvtsi2ss 3
|
||||
#-msvc "_mm_cvtsi64_sd" #+msvc "_mm_cvtsi64x_sd")
|
||||
#+(or x86_64 x86-64)
|
||||
(def-unary-intrinsic convert-sd-to-si64 (signed-byte 64) cvtsd2si 3
|
||||
#-msvc "_mm_cvtsd_si64" #+msvc "_mm_cvtsd_si64x" :arg-type double-sse-pack)
|
||||
#+(or x86_64 x86-64)
|
||||
(def-unary-intrinsic truncate-sd-to-si64 (signed-byte 64) cvttsd2si 3
|
||||
#-msvc "_mm_cvttsd_si64" #+msvc "_mm_cvttsd_si64x" :arg-type double-sse-pack)
|
||||
|
||||
;;; Integer
|
||||
|
||||
;; Initialization
|
||||
|
||||
#+ecl
|
||||
(progn
|
||||
(def-intrinsic set1-pi8 (fixnum) int-sse-pack "_mm_set1_epi8")
|
||||
(def-intrinsic set1-pi16 (fixnum) int-sse-pack "_mm_set1_epi16")
|
||||
(def-intrinsic set1-pi32 (ext:integer32) int-sse-pack "_mm_set1_epi32")
|
||||
#+x86_64
|
||||
(def-intrinsic set1-pi64 (ext:integer64) int-sse-pack "_mm_set1_epi64x")
|
||||
|
||||
(def-intrinsic set1-pu32 (ext:byte32) int-sse-pack "_mm_set1_epi32")
|
||||
#+x86_64
|
||||
(def-intrinsic set1-pu64 (ext:byte64) int-sse-pack "_mm_set1_epi64x")
|
||||
|
||||
;;-----
|
||||
(def-intrinsic set-pi8 (fixnum fixnum fixnum fixnum
|
||||
fixnum fixnum fixnum fixnum
|
||||
fixnum fixnum fixnum fixnum
|
||||
fixnum fixnum fixnum fixnum) int-sse-pack "_mm_set_epi8")
|
||||
(def-intrinsic set-pi16 (fixnum fixnum fixnum fixnum
|
||||
fixnum fixnum fixnum fixnum) int-sse-pack "_mm_set_epi16")
|
||||
(def-intrinsic set-pi32 (ext:integer32 ext:integer32 ext:integer32 ext:integer32) int-sse-pack "_mm_set_epi32")
|
||||
#+x86_64
|
||||
(def-intrinsic set-pi64 (ext:integer64 ext:integer64) int-sse-pack "_mm_set_epi64x")
|
||||
|
||||
(def-intrinsic set-pu32 (ext:byte32 ext:byte32 ext:byte32 ext:byte32) int-sse-pack "_mm_set_epi32")
|
||||
#+x86_64
|
||||
(def-intrinsic set-pu64 (ext:byte64 ext:byte64) int-sse-pack "_mm_set_epi64x")
|
||||
|
||||
;;-----
|
||||
(def-intrinsic setr-pi8 (fixnum fixnum fixnum fixnum
|
||||
fixnum fixnum fixnum fixnum
|
||||
fixnum fixnum fixnum fixnum
|
||||
fixnum fixnum fixnum fixnum) int-sse-pack "_mm_setr_epi8")
|
||||
(def-intrinsic setr-pi16 (fixnum fixnum fixnum fixnum
|
||||
fixnum fixnum fixnum fixnum) int-sse-pack "_mm_setr_epi16")
|
||||
(def-intrinsic setr-pi32 (ext:integer32 ext:integer32 ext:integer32 ext:integer32) int-sse-pack "_mm_setr_epi32")
|
||||
#+x86_64
|
||||
(def-intrinsic setr-pi64 (ext:integer64 ext:integer64) int-sse-pack "_mm_set_epi64x" :reorder-args t)
|
||||
|
||||
(def-intrinsic setr-pu32 (ext:byte32 ext:byte32 ext:byte32 ext:byte32) int-sse-pack "_mm_setr_epi32")
|
||||
#+x86_64
|
||||
(def-intrinsic setr-pu64 (ext:byte64 ext:byte64) int-sse-pack "_mm_set_epi64x" :reorder-args t)
|
||||
|
||||
;;-----
|
||||
(def-intrinsic setzero-pi () int-sse-pack "_mm_setzero_si128"))
|
||||
|
||||
;; Memory
|
||||
|
||||
(def-load-intrinsic mem-ref-pi int-sse-pack movdqu "_mm_loadu_si128")
|
||||
(def-load-intrinsic mem-ref-api int-sse-pack movdqa "_mm_load_si128")
|
||||
|
||||
(def-load-intrinsic mem-ref-si64 int-sse-pack movd "_mm_loadl_epi64")
|
||||
|
||||
(def-store-intrinsic mem-set-pi int-sse-pack movdqu "_mm_storeu_si128" :setf-name mem-ref-pi)
|
||||
(def-store-intrinsic mem-set-api int-sse-pack movdqa "_mm_store_si128" :setf-name mem-ref-api)
|
||||
|
||||
(def-store-intrinsic mem-set-si64 int-sse-pack movd "_mm_storel_epi64" :setf-name mem-ref-si64)
|
||||
|
||||
(def-store-intrinsic stream-pi int-sse-pack movntdq "_mm_stream_si128")
|
||||
|
||||
;; Masked move
|
||||
|
||||
#+ecl
|
||||
(def-mem-intrinsic maskmoveu-pi "char" nil "_mm_maskmoveu_si128" :prefix-args (int-sse-pack int-sse-pack))
|
||||
|
||||
#+sbcl
|
||||
(progn
|
||||
(defknown %maskmoveu-pi (sse-pack sse-pack system-area-pointer fixnum) (values) (unsafe))
|
||||
|
||||
(define-vop (%maskmoveu-pi)
|
||||
(:translate %maskmoveu-pi)
|
||||
(:args (value :scs (sse-reg))
|
||||
(mask :scs (sse-reg))
|
||||
(sap :scs (sap-reg) :target rdi)
|
||||
(offset :scs (signed-reg)))
|
||||
(:arg-types sse-pack sse-pack system-area-pointer signed-num)
|
||||
(:temporary (:sc sap-reg :offset rdi-offset :from :eval) rdi)
|
||||
(:policy :fast-safe)
|
||||
(:note "inline MASKMOVEU operation")
|
||||
(:generator 5
|
||||
(if (location= sap rdi)
|
||||
(inst add rdi offset)
|
||||
(inst lea rdi (make-ea :qword :base sap :index offset)))
|
||||
(inst maskmovdqu value mask)))
|
||||
|
||||
(define-vop (%maskmoveu-pi-c)
|
||||
(:translate %maskmoveu-pi)
|
||||
(:args (value :scs (sse-reg))
|
||||
(mask :scs (sse-reg))
|
||||
(sap :scs (sap-reg) :target rdi))
|
||||
(:arg-types sse-pack sse-pack system-area-pointer (:constant (signed-byte 32)))
|
||||
(:info offset)
|
||||
(:temporary (:sc sap-reg :offset rdi-offset :from :eval) rdi)
|
||||
(:policy :fast-safe)
|
||||
(:note "inline MASKMOVEU operation")
|
||||
(:generator 4
|
||||
(if (location= sap rdi)
|
||||
(unless (= offset 0)
|
||||
(inst add rdi offset))
|
||||
(if (= offset 0)
|
||||
(inst mov rdi sap)
|
||||
(inst lea rdi (make-ea :qword :base sap :disp offset))))
|
||||
(inst maskmovdqu value mask)))
|
||||
|
||||
(def-splice-transform %maskmoveu-pi (value mask (sap+ sap offset1) offset2)
|
||||
(%maskmoveu-pi value mask sap (+ offset1 offset2))))
|
||||
|
||||
;; Arithmetics
|
||||
|
||||
(def-binary-intrinsic add-pi8 int-sse-pack paddb 1 "_mm_add_epi8" :commutative t)
|
||||
(def-binary-intrinsic add-pi16 int-sse-pack paddw 1 "_mm_add_epi16" :commutative t)
|
||||
(def-binary-intrinsic add-pi32 int-sse-pack paddd 1 "_mm_add_epi32" :commutative t)
|
||||
(def-binary-intrinsic add-pi64 int-sse-pack paddq 1 "_mm_add_epi64" :commutative t)
|
||||
|
||||
(def-binary-intrinsic adds-pi8 int-sse-pack paddsb 1 "_mm_adds_epi8" :commutative t)
|
||||
(def-binary-intrinsic adds-pi16 int-sse-pack paddsw 1 "_mm_adds_epi16" :commutative t)
|
||||
(def-binary-intrinsic adds-pu8 int-sse-pack paddusb 1 "_mm_adds_epu8" :commutative t)
|
||||
(def-binary-intrinsic adds-pu16 int-sse-pack paddusw 1 "_mm_adds_epu16" :commutative t)
|
||||
|
||||
(def-binary-intrinsic avg-pu8 int-sse-pack pavgb 1 "_mm_avg_epu8" :commutative t)
|
||||
(def-binary-intrinsic avg-pu16 int-sse-pack pavgw 1 "_mm_avg_epu16" :commutative t)
|
||||
|
||||
(def-binary-intrinsic madd-pi16 int-sse-pack pmaddwd 1 "_mm_madd_epi16" :commutative t)
|
||||
|
||||
(def-binary-intrinsic max-pu8 int-sse-pack pmaxub 1 "_mm_max_epu8" :commutative t)
|
||||
(def-binary-intrinsic max-pi16 int-sse-pack pmaxsw 1 "_mm_max_epi16" :commutative t)
|
||||
(def-binary-intrinsic min-pu8 int-sse-pack pminub 1 "_mm_min_epu8" :commutative t)
|
||||
(def-binary-intrinsic min-pi16 int-sse-pack pminsw 1 "_mm_min_epi16" :commutative t)
|
||||
|
||||
(def-binary-intrinsic mulhi-pi16 int-sse-pack pmulhw 3 "_mm_mulhi_epi16" :commutative t)
|
||||
(def-binary-intrinsic mulhi-pu16 int-sse-pack pmulhuw 3 "_mm_mulhi_epu16" :commutative t)
|
||||
(def-binary-intrinsic mullo-pi16 int-sse-pack pmullw 3 "_mm_mullo_epi16" :commutative t)
|
||||
|
||||
(def-binary-intrinsic mul-pu32 int-sse-pack pmuludq 3 "_mm_mul_epu32" :commutative t)
|
||||
|
||||
(def-binary-intrinsic sad-pu8 int-sse-pack psadbw 1 "_mm_sad_epu8" :commutative t)
|
||||
|
||||
(def-binary-intrinsic sub-pi8 int-sse-pack psubb 1 "_mm_sub_epi8")
|
||||
(def-binary-intrinsic sub-pi16 int-sse-pack psubw 1 "_mm_sub_epi16")
|
||||
(def-binary-intrinsic sub-pi32 int-sse-pack psubd 1 "_mm_sub_epi32")
|
||||
(def-binary-intrinsic sub-pi64 int-sse-pack psubq 1 "_mm_sub_epi64")
|
||||
|
||||
(def-binary-intrinsic subs-pi8 int-sse-pack psubsb 1 "_mm_subs_epi8")
|
||||
(def-binary-intrinsic subs-pi16 int-sse-pack psubsw 1 "_mm_subs_epi16")
|
||||
(def-binary-intrinsic subs-pu8 int-sse-pack psubusb 1 "_mm_subs_epu8")
|
||||
(def-binary-intrinsic subs-pu16 int-sse-pack psubusw 1 "_mm_subs_epu16")
|
||||
|
||||
;; Bitwise logic
|
||||
|
||||
#+sbcl
|
||||
(def-not-intrinsic not-pi int-sse-pack pxor)
|
||||
|
||||
(def-binary-intrinsic and-pi int-sse-pack pand 1 "_mm_and_si128" :commutative t)
|
||||
(def-binary-intrinsic andnot-pi int-sse-pack pandn 1 "_mm_andnot_si128")
|
||||
(def-binary-intrinsic or-pi int-sse-pack por 1 "_mm_or_si128" :commutative t)
|
||||
(def-binary-intrinsic xor-pi int-sse-pack pxor 1 "_mm_xor_si128" :commutative t)
|
||||
|
||||
;; Shifts
|
||||
|
||||
(def-unary-intrinsic slli-pi int-sse-pack pslldq 1 "_mm_slli_si128" :partial :one-arg :immediate-arg (unsigned-byte 8))
|
||||
|
||||
(def-sse-int-intrinsic slli-pi16 fixnum int-sse-pack psllw 3 "_mm_slli_epi16" :make-temporary t)
|
||||
(def-sse-int-intrinsic slli-pi32 fixnum int-sse-pack pslld 3 "_mm_slli_epi32" :make-temporary t)
|
||||
(def-sse-int-intrinsic slli-pi64 fixnum int-sse-pack psllq 3 "_mm_slli_epi64" :make-temporary t)
|
||||
(def-binary-intrinsic sll-pi16 int-sse-pack psllw 1 "_mm_sll_epi16")
|
||||
(def-binary-intrinsic sll-pi32 int-sse-pack pslld 1 "_mm_sll_epi32")
|
||||
(def-binary-intrinsic sll-pi64 int-sse-pack psllq 1 "_mm_sll_epi64")
|
||||
|
||||
(def-sse-int-intrinsic srai-pi16 fixnum int-sse-pack psraw 3 "_mm_srai_epi16" :make-temporary t)
|
||||
(def-sse-int-intrinsic srai-pi32 fixnum int-sse-pack psrad 3 "_mm_srai_epi32" :make-temporary t)
|
||||
(def-binary-intrinsic sra-pi16 int-sse-pack psraw 1 "_mm_sra_epi16")
|
||||
(def-binary-intrinsic sra-pi32 int-sse-pack psrad 1 "_mm_sra_epi32")
|
||||
|
||||
(def-unary-intrinsic srli-pi int-sse-pack psrldq 1 "_mm_srli_si128" :partial :one-arg :immediate-arg (unsigned-byte 8))
|
||||
|
||||
(def-sse-int-intrinsic srli-pi16 fixnum int-sse-pack psrlw 3 "_mm_srli_epi16" :make-temporary t)
|
||||
(def-sse-int-intrinsic srli-pi32 fixnum int-sse-pack psrld 3 "_mm_srli_epi32" :make-temporary t)
|
||||
(def-sse-int-intrinsic srli-pi64 fixnum int-sse-pack psrlq 3 "_mm_srli_epi64" :make-temporary t)
|
||||
(def-binary-intrinsic srl-pi16 int-sse-pack psrlw 1 "_mm_srl_epi16")
|
||||
(def-binary-intrinsic srl-pi32 int-sse-pack psrld 1 "_mm_srl_epi32")
|
||||
(def-binary-intrinsic srl-pi64 int-sse-pack psrlq 1 "_mm_srl_epi64")
|
||||
|
||||
#+sbcl
|
||||
(macrolet ((defimm (name insn bits &key arithmetic)
|
||||
`(define-vop (,(symbolicate "%" name "-IMM") sse-int-base-op)
|
||||
(:translate ,name)
|
||||
(:args (x :scs (sse-reg) :target r))
|
||||
(:arg-types sse-pack (:constant fixnum))
|
||||
(:result-types sb-kernel:int-sse-pack)
|
||||
(:info immv)
|
||||
(:generator 1
|
||||
,@(let ((core `(progn
|
||||
(ensure-move int-sse-pack r x)
|
||||
(unless (= immv 0)
|
||||
(inst ,insn r immv)))))
|
||||
(if arithmetic
|
||||
`((when (or (< immv 0) (>= immv ,bits))
|
||||
(setf immv ,bits))
|
||||
,core)
|
||||
`((if (or (< immv 0) (>= immv ,bits))
|
||||
(inst pxor r r)
|
||||
,core))))))))
|
||||
(defimm slli-pi16 psllw-imm 16)
|
||||
(defimm slli-pi32 pslld-imm 32)
|
||||
(defimm slli-pi64 psllq-imm 64)
|
||||
(defimm srai-pi16 psraw-imm 16 :arithmetic t)
|
||||
(defimm srai-pi32 psrad-imm 32 :arithmetic t)
|
||||
(defimm srli-pi16 psrlw-imm 16)
|
||||
(defimm srli-pi32 psrld-imm 32)
|
||||
(defimm srli-pi64 psrlq-imm 64))
|
||||
|
||||
;; Comparisons
|
||||
|
||||
(def-binary-intrinsic =-pi8 int-sse-pack pcmpeqb 1 "_mm_cmpeq_epi8")
|
||||
(def-binary-intrinsic =-pi16 int-sse-pack pcmpeqw 1 "_mm_cmpeq_epi16")
|
||||
(def-binary-intrinsic =-pi32 int-sse-pack pcmpeqd 1 "_mm_cmpeq_epi32")
|
||||
|
||||
#+ecl
|
||||
(def-binary-intrinsic <-pi8 int-sse-pack nil nil "_mm_cmplt_epi8")
|
||||
#+ecl
|
||||
(def-binary-intrinsic <-pi16 int-sse-pack nil nil "_mm_cmplt_epi16")
|
||||
#+ecl
|
||||
(def-binary-intrinsic <-pi32 int-sse-pack nil nil "_mm_cmplt_epi32")
|
||||
|
||||
(def-binary-intrinsic >-pi8 int-sse-pack pcmpgtb 1 "_mm_cmpgt_epi8")
|
||||
(def-binary-intrinsic >-pi16 int-sse-pack pcmpgtw 1 "_mm_cmpgt_epi16")
|
||||
(def-binary-intrinsic >-pi32 int-sse-pack pcmpgtd 1 "_mm_cmpgt_epi32")
|
||||
|
||||
;; Misc
|
||||
|
||||
(def-binary-intrinsic packs-pi16 int-sse-pack packsswb 1 "_mm_packs_epi16")
|
||||
(def-binary-intrinsic packs-pi32 int-sse-pack packssdw 1 "_mm_packs_epi32")
|
||||
(def-binary-intrinsic packus-pi16 int-sse-pack packuswb 1 "_mm_packus_epi16")
|
||||
|
||||
(def-unary-intrinsic extract-pi16 (unsigned-byte 16) pextrw 1 "_mm_extract_epi16"
|
||||
:immediate-arg (unsigned-byte 8) :arg-type int-sse-pack)
|
||||
(def-sse-int-intrinsic insert-pi16 fixnum int-sse-pack pinsrw 1 "_mm_insert_epi16"
|
||||
:immediate-arg (unsigned-byte 8))
|
||||
|
||||
(def-unary-intrinsic movemask-pi8 (unsigned-byte 16) pmovmskb 1 "_mm_movemask_epi8" :arg-type int-sse-pack)
|
||||
|
||||
(def-binary-intrinsic unpackhi-pi8 int-sse-pack punpckhbw 1 "_mm_unpackhi_epi8")
|
||||
(def-binary-intrinsic unpackhi-pi16 int-sse-pack punpckhwd 1 "_mm_unpackhi_epi16")
|
||||
(def-binary-intrinsic unpackhi-pi32 int-sse-pack punpckhdq 1 "_mm_unpackhi_epi32")
|
||||
(def-binary-intrinsic unpackhi-pi64 int-sse-pack punpckhqdq 1 "_mm_unpackhi_epi64")
|
||||
|
||||
(def-binary-intrinsic unpacklo-pi8 int-sse-pack punpcklbw 1 "_mm_unpacklo_epi8")
|
||||
(def-binary-intrinsic unpacklo-pi16 int-sse-pack punpcklwd 1 "_mm_unpacklo_epi16")
|
||||
(def-binary-intrinsic unpacklo-pi32 int-sse-pack punpckldq 1 "_mm_unpacklo_epi32")
|
||||
(def-binary-intrinsic unpacklo-pi64 int-sse-pack punpcklqdq 1 "_mm_unpacklo_epi64")
|
||||
|
||||
(def-unary-intrinsic move-pi64 int-sse-pack movq 1 "_mm_move_epi64")
|
||||
|
||||
;; Shuffle
|
||||
|
||||
(def-unary-intrinsic shuffle-pi32 int-sse-pack pshufd 1 "_mm_shuffle_epi32" :immediate-arg (unsigned-byte 8))
|
||||
(def-unary-intrinsic shufflelo-pi16 int-sse-pack pshuflw 1 "_mm_shufflelo_epi16" :immediate-arg (unsigned-byte 8))
|
||||
(def-unary-intrinsic shufflehi-pi16 int-sse-pack pshufhw 1 "_mm_shufflehi_epi16" :immediate-arg (unsigned-byte 8))
|
||||
|
||||
;; Conversion
|
||||
|
||||
#+sbcl
|
||||
(progn
|
||||
(export 'convert-si32-to-pi)
|
||||
(defknown convert-si32-to-pi ((signed-byte 32)) int-sse-pack (foldable flushable))
|
||||
(export 'convert-su32-to-pi)
|
||||
(defknown convert-su32-to-pi ((unsigned-byte 32)) int-sse-pack (foldable flushable))
|
||||
(export 'convert-si64-to-pi)
|
||||
(defknown convert-si64-to-pi ((signed-byte 64)) int-sse-pack (foldable flushable))
|
||||
(export 'convert-su64-to-pi)
|
||||
(defknown convert-su64-to-pi ((unsigned-byte 64)) int-sse-pack (foldable flushable))
|
||||
(defknown %set-int ((signed-byte 64)) int-sse-pack (foldable flushable always-translatable))
|
||||
(defknown %set-uint ((unsigned-byte 64)) int-sse-pack (foldable flushable always-translatable))
|
||||
|
||||
(define-vop (%set-int)
|
||||
(:translate %set-int %set-uint
|
||||
convert-si32-to-pi convert-su32-to-pi
|
||||
convert-si64-to-pi convert-su64-to-pi)
|
||||
(:args (arg :scs (signed-reg unsigned-reg signed-stack unsigned-stack)))
|
||||
(:arg-types untagged-num)
|
||||
(:results (dst :scs (sse-reg)))
|
||||
(:result-types sb-kernel:int-sse-pack)
|
||||
(:policy :fast-safe)
|
||||
(:generator 1
|
||||
(inst movd dst arg))))
|
||||
|
||||
#+ecl
|
||||
(progn
|
||||
(def-intrinsic convert-si32-to-pi (ext:integer32) int-sse-pack "_mm_cvtsi32_si128")
|
||||
(def-intrinsic convert-su32-to-pi (ext:byte32) int-sse-pack "_mm_cvtsi32_si128")
|
||||
#+x86_64
|
||||
(def-intrinsic convert-si64-to-pi (ext:integer64) int-sse-pack #-msvc "_mm_cvtsi64_si128" #+msvc "_mm_cvtsi64x_si128")
|
||||
#+x86_64
|
||||
(def-intrinsic convert-su64-to-pi (ext:byte64) int-sse-pack #-msvc "_mm_cvtsi64_si128" #+msvc "_mm_cvtsi64x_si128"))
|
||||
|
||||
(def-cvt-to-int32-intrinsic convert-pi-to-si32 (signed-byte 32) movd 1 "_mm_cvtsi128_si32"
|
||||
:arg-type int-sse-pack)
|
||||
(def-unary-intrinsic convert-pi-to-su32 (unsigned-byte 32) movd 1 "_mm_cvtsi128_si32"
|
||||
:result-size :dword :arg-type int-sse-pack)
|
||||
|
||||
#+(or x86_64 x86-64)
|
||||
(def-unary-intrinsic convert-pi-to-si64 (signed-byte 64) movd 1
|
||||
#-msvc "_mm_cvtsi128_si64" #+msvc "_mm_cvtsi128_si64x" :arg-type int-sse-pack)
|
||||
#+(or x86_64 x86-64)
|
||||
(def-unary-intrinsic convert-pi-to-su64 (unsigned-byte 64) movd 1
|
||||
#-msvc "_mm_cvtsi128_si64" #+msvc "_mm_cvtsi128_si64x" :arg-type int-sse-pack)
|
||||
|
||||
61
contrib/cl-simd/sse-package.lisp
Normal file
61
contrib/cl-simd/sse-package.lisp
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
;;; -*- mode: Lisp; indent-tabs-mode: nil; -*-
|
||||
;;;
|
||||
;;; Copyright (c) 2010, Alexander Gavrilov (angavrilov@gmail.com)
|
||||
;;;
|
||||
;;; This file defines a package for all SSE intrinsics.
|
||||
;;;
|
||||
|
||||
#+ecl
|
||||
(eval-when (:load-toplevel)
|
||||
(require 'cmp))
|
||||
|
||||
#+sbcl
|
||||
(pushnew :SSE2 *features*)
|
||||
|
||||
(defpackage #:SSE
|
||||
#+sbcl
|
||||
(:use #:COMMON-LISP #:SB-C #:SB-VM #:SB-INT #:SB-KERNEL #:SB-ASSEM #:SB-EXT #:SB-SYS)
|
||||
#+sbcl
|
||||
(:import-from #:SB-VM
|
||||
#:SINGLE-REG #:DOUBLE-REG #:SSE-REG #:SSE-PACK-IMMEDIATE
|
||||
#:SIGNED-REG #:SIGNED-STACK #:UNSIGNED-REG #:UNSIGNED-STACK
|
||||
#:SIGNED-NUM #:UNSIGNED-NUM #:UNTAGGED-NUM #:IMMEDIATE
|
||||
#:SAP-REG #:DESCRIPTOR-REG #:ANY-REG #:TAGGED-NUM
|
||||
#:RAX-OFFSET #:RDI-OFFSET #:RBP-TN #:FRAME-BYTE-OFFSET
|
||||
#:MAKE-EA #:REG-IN-SIZE #:LOADW)
|
||||
#+sbcl
|
||||
(:import-from #:SB-C
|
||||
#:SPLICE-FUN-ARGS #:EXTRACT-FUN-ARGS
|
||||
#:%DEFTRANSFORM #:COMMUTATIVE-ARG-SWAP
|
||||
#:GIVE-UP-IR1-TRANSFORM #:ABORT-IR1-TRANSFORM
|
||||
#:INSERT-ARRAY-BOUNDS-CHECKS #:VECTOR-LENGTH
|
||||
#:ASSERT-ARRAY-RANK #:ASSERT-LVAR-TYPE
|
||||
#:CONSTANT-LVAR-P #:LVAR-VALUE #:LVAR-TYPE
|
||||
#:LEXENV-POLICY #:NODE-LEXENV
|
||||
#:FIND-SAETP #:FIND-SAETP-BY-CTYPE)
|
||||
#+sbcl
|
||||
(:import-from #:SB-IMPL
|
||||
#:%ARRAY-ROW-MAJOR-INDEX)
|
||||
#+sbcl
|
||||
(:shadow #:INT-SSE-PACK #:FLOAT-SSE-PACK #:DOUBLE-SSE-PACK)
|
||||
#+ecl
|
||||
(:use #:COMMON-LISP #:FFI)
|
||||
#+ecl
|
||||
(:import-from #:EXT
|
||||
#:INT-SSE-PACK #:FLOAT-SSE-PACK #:DOUBLE-SSE-PACK
|
||||
#:SSE-PACK-P #:ARRAY-ELEMENT-TYPE-BYTE-SIZE
|
||||
#:*SSE-PACK-PRINT-MODE*)
|
||||
#+ecl
|
||||
(:shadow #:SSE-PACK)
|
||||
;; Common exports:
|
||||
(:export #:SSE-PACK #:SSE-PACK-P
|
||||
#:INT-SSE-PACK #:FLOAT-SSE-PACK #:DOUBLE-SSE-PACK
|
||||
#:*SSE-PACK-PRINT-MODE*
|
||||
#:SSE-ARRAY #:MAKE-SSE-ARRAY
|
||||
#:0.0-PS #:TRUE-SS #:FALSE-SS #:TRUE-PS #:FALSE-PS
|
||||
#:SET1-PS #:SET-PS #:SETR-PS #:SETZERO-PS
|
||||
#:0.0-PD #:TRUE-SD #:FALSE-SD #:TRUE-PD #:FALSE-PD
|
||||
#:SET1-PD #:SET-PD #:SETR-PD #:SETZERO-PD
|
||||
#:0-PI #:TRUE-PI #:FALSE-PI #:SETZERO-PI
|
||||
#:CPU-MXCSR #:CPU-MXCSR-BITS #:WITH-SAVED-MXCSR #:CPU-CONFIGURE-ROUNDING))
|
||||
|
||||
128
contrib/cl-simd/sse-utils.lisp
Normal file
128
contrib/cl-simd/sse-utils.lisp
Normal file
|
|
@ -0,0 +1,128 @@
|
|||
;;; -*- mode: Lisp; indent-tabs-mode: nil; -*-
|
||||
;;;
|
||||
;;; Copyright (c) 2010, Alexander Gavrilov (angavrilov@gmail.com)
|
||||
;;;
|
||||
;;; This file implements some common utility functions.
|
||||
;;;
|
||||
|
||||
(in-package #:SSE)
|
||||
|
||||
;;; CPU control
|
||||
|
||||
(eval-when (:compile-toplevel :load-toplevel :execute)
|
||||
(declaim (ftype (function (&rest t) (unsigned-byte 32)) cpu-mxcsr-bits))
|
||||
(defun cpu-mxcsr-bits (&rest tags)
|
||||
(loop with mask = 0
|
||||
for tag in tags
|
||||
for bit = (if (listp tag)
|
||||
(apply #'cpu-mxcsr-bits tag)
|
||||
(ecase tag
|
||||
(:except-invalid #x1)
|
||||
(:except-denormal #x2)
|
||||
(:except-divide-zero #x4)
|
||||
(:except-overflow #x8)
|
||||
(:except-underflow #x10)
|
||||
(:except-precision #x20)
|
||||
(:except-all #x3F)
|
||||
(:denormals-are-zero #x40)
|
||||
(:mask-invalid #x80)
|
||||
(:mask-denormal #x100)
|
||||
(:mask-divide-zero #x200)
|
||||
(:mask-overflow #x400)
|
||||
(:mask-underflow #x800)
|
||||
(:mask-precision #x1000)
|
||||
(:mask-all #x1f80)
|
||||
(:round-nearest 0)
|
||||
(:round-negative #x2000)
|
||||
(:round-positive #x4000)
|
||||
(:round-zero #x6000)
|
||||
(:round-bits #x6000)
|
||||
(:flush-to-zero #x8000)))
|
||||
do (setf mask (logior mask bit))
|
||||
finally (return mask)))
|
||||
(defun expand-cpu-mxcsr-bits (tags on-fail)
|
||||
(loop for tag in tags
|
||||
when (keywordp tag) collect tag into kwds
|
||||
else collect tag into rest
|
||||
finally
|
||||
(return
|
||||
(cond ((and kwds rest)
|
||||
`(logior ,(apply #'cpu-mxcsr-bits kwds)
|
||||
(cpu-mxcsr-bits ,@rest)))
|
||||
(kwds
|
||||
(apply #'cpu-mxcsr-bits kwds))
|
||||
(t on-fail))))))
|
||||
|
||||
(define-compiler-macro cpu-mxcsr-bits (&whole whole &rest tags)
|
||||
(expand-cpu-mxcsr-bits tags whole))
|
||||
|
||||
(defmacro with-saved-mxcsr (&body code)
|
||||
(let ((v (gensym "CSR")))
|
||||
`(let ((,v (cpu-mxcsr)))
|
||||
(declare (type (unsigned-byte 32) ,v)
|
||||
#+ecl (:read-only ,v))
|
||||
(unwind-protect (progn ,@code)
|
||||
(%set-cpu-mxcsr ,v)))))
|
||||
|
||||
#+nil
|
||||
(defun cpu-check-exceptions (&rest tags)
|
||||
(let ((mask (logand (cpu-mxcsr-bits (or tags :except-all))
|
||||
(cpu-mxcsr-bits :except-all)))
|
||||
(csr (get-cpu-mxcsr)))
|
||||
(declare (optimize (safety 0) (speed 3) (debug 0))
|
||||
(type fixnum csr mask))
|
||||
(not (zerop (logand mask csr)))))
|
||||
|
||||
#+nil
|
||||
(define-compiler-macro cpu-check-exceptions (&whole whole &rest tags)
|
||||
(let ((bits (expand-cpu-mxcsr-bits (or tags '(except-all)) nil)))
|
||||
(if (integerp bits)
|
||||
`(locally (declare (optimize (speed 3) (safety 0) (debug 0)))
|
||||
(not (zerop (logand (cpu-get-mxcsr)
|
||||
,(logand bits (cpu-mxcsr-bits :except-all))))))
|
||||
whole)))
|
||||
|
||||
#+nil
|
||||
(macrolet ((foo (&rest names)
|
||||
(let* ((kwds (mapcar (lambda (x) (intern (format nil "MASK-~A" x) :keyword)) names))
|
||||
(pvars (mapcar (lambda (x) (intern (format nil "~A-P" x))) names)))
|
||||
`(defun cpu-mask-exceptions (&key
|
||||
,@(mapcar (lambda (n p) `(,n nil ,p)) names pvars)
|
||||
(other nil rest-p))
|
||||
(let ((set-bits (logior ,@(mapcar (lambda (n k) `(if ,n (cpu-mxcsr-bits ,k) 0)) names kwds)))
|
||||
(arg-bits (logior ,@(mapcar (lambda (p k) `(if ,p (cpu-mxcsr-bits ,k) 0)) pvars kwds))))
|
||||
(%set-cpu-mxcsr
|
||||
(the fixnum
|
||||
(if (not rest-p)
|
||||
(logior set-bits (logand (get-cpu-mxcsr) (lognot arg-bits)))
|
||||
(logior set-bits
|
||||
(if other (logand (cpu-mxcsr-bits :mask-all) (lognot arg-bits)) 0)
|
||||
(logiand (get-cpu-mxcsr) (lognot (cpu-mxcsr-bits :mask-all)))))))
|
||||
nil)))))
|
||||
(foo invalid denormal divide-zero overflow underflow precision))
|
||||
|
||||
(defun cpu-configure-rounding (&key round-to
|
||||
(denormals-are-zero nil daz-p)
|
||||
(flush-to-zero nil ftz-p))
|
||||
(let ((set 0)
|
||||
(mask 0))
|
||||
(when round-to
|
||||
(setf mask (cpu-mxcsr-bits :round-bits)
|
||||
set (ecase round-to
|
||||
(:zero (cpu-mxcsr-bits :round-zero))
|
||||
(:negative (cpu-mxcsr-bits :round-negative))
|
||||
(:positive (cpu-mxcsr-bits :round-positive))
|
||||
(:nearest (cpu-mxcsr-bits :round-nearest)))))
|
||||
(when daz-p
|
||||
(setf mask (logior mask (cpu-mxcsr-bits :denormals-are-zero)))
|
||||
(when denormals-are-zero
|
||||
(setf set (logior set (cpu-mxcsr-bits :denormals-are-zero)))))
|
||||
(when ftz-p
|
||||
(setf mask (logior mask (cpu-mxcsr-bits :flush-to-zero)))
|
||||
(when flush-to-zero
|
||||
(setf set (logior set (cpu-mxcsr-bits :flush-to-zero)))))
|
||||
(setf (cpu-mxcsr)
|
||||
(the (unsigned-byte 32)
|
||||
(logior set (logand (cpu-mxcsr) (lognot mask)))))
|
||||
nil))
|
||||
|
||||
156
contrib/cl-simd/test-sfmt.lisp
Normal file
156
contrib/cl-simd/test-sfmt.lisp
Normal file
|
|
@ -0,0 +1,156 @@
|
|||
;;; -*- mode: Lisp; indent-tabs-mode: nil; -*-
|
||||
;;;
|
||||
;;; Dumbly translated from C code at: http://github.com/jj1bdx/sfmt-extstate
|
||||
|
||||
(eval-when (:compile-toplevel :load-toplevel :execute)
|
||||
(require :cl-simd))
|
||||
|
||||
(defpackage #:sfmt-test
|
||||
(:use #:common-lisp #:sse))
|
||||
|
||||
(in-package #:sfmt-test)
|
||||
|
||||
(deftype uint32 () '(unsigned-byte 32))
|
||||
(deftype uint32-vector () '(sse-array uint32 (*)))
|
||||
|
||||
(defconstant +mexp+ 19937)
|
||||
(defconstant +n+ (1+ (floor +mexp+ 128)))
|
||||
(defconstant +pos1+ 122)
|
||||
(defconstant +sl1+ 18)
|
||||
(defconstant +sl2+ 1)
|
||||
(defconstant +sr1+ 11)
|
||||
(defconstant +sr2+ 1)
|
||||
|
||||
(defconstant +msk1+ #xdfffffef)
|
||||
(defconstant +msk2+ #xddfecb7f)
|
||||
(defconstant +msk3+ #xbffaffff)
|
||||
(defconstant +msk4+ #xbffffff6)
|
||||
|
||||
(defconstant +parity1+ #x00000001)
|
||||
(defconstant +parity2+ #x00000000)
|
||||
(defconstant +parity3+ #x00000000)
|
||||
(defconstant +parity4+ #x13c9e684)
|
||||
|
||||
(defconstant +uint32-mask+ #xFFFFFFFF)
|
||||
|
||||
(defvar *work-buffer* (make-sse-array (* +n+ 4) :element-type 'uint32))
|
||||
|
||||
(defun period-certification (buffer)
|
||||
(declare (type uint32-vector buffer))
|
||||
(let ((inner (logxor (logand (aref buffer 0) +parity1+)
|
||||
(logand (aref buffer 1) +parity2+)
|
||||
(logand (aref buffer 2) +parity3+)
|
||||
(logand (aref buffer 3) +parity4+))))
|
||||
(loop for i = 16 then (ash i -1) while (> i 0)
|
||||
do (setf inner (logxor inner (ash inner (- i)))))
|
||||
(when (logtest inner 1)
|
||||
(return-from period-certification)))
|
||||
(loop
|
||||
for i from 0 to 3
|
||||
for parity in (load-time-value (list +parity1+ +parity2+ +parity3+ +parity4+))
|
||||
do (loop
|
||||
for work = 1 then (ash work 1)
|
||||
for j from 0 below 32
|
||||
when (/= 0 (logand work parity))
|
||||
do (progn
|
||||
(setf (aref buffer i)
|
||||
(logxor (aref buffer i) work))
|
||||
(return-from period-certification)))))
|
||||
|
||||
(defun init-gen-rand (seed buffer)
|
||||
(declare (type uint32 seed)
|
||||
(type uint32-vector buffer))
|
||||
(setf (aref buffer 0) seed)
|
||||
(loop for i from 1 below (array-total-size buffer)
|
||||
do (setf (aref buffer i)
|
||||
(logand +uint32-mask+
|
||||
(+ i
|
||||
(* 1812433253 (logxor (aref buffer (1- i))
|
||||
(ash (aref buffer (1- i)) -30)))))))
|
||||
(period-certification buffer))
|
||||
|
||||
;; Should be an inline function, but it's broken in ECL
|
||||
(defmacro recursion (a b c d mask)
|
||||
`(let ((x ,a)
|
||||
(y (srli-pi32 ,b +sr1+))
|
||||
(z (srli-pi ,c +sr2+))
|
||||
(v (slli-pi32 ,d +sl1+))
|
||||
(m ,mask))
|
||||
(xor-pi (xor-pi (xor-pi z x) v)
|
||||
(xor-pi (slli-pi x +sl2+)
|
||||
(and-pi y m)))))
|
||||
|
||||
(defmacro sfmt-aref (buf idx)
|
||||
`(row-major-aref-api ,buf (the fixnum (* 4 (the fixnum ,idx)))))
|
||||
|
||||
(defun gen-rand-all (buffer)
|
||||
(declare (optimize (speed 3) #+ecl (safety 0) (debug 0)
|
||||
#+sbcl (sb-c::insert-array-bounds-checks 0))
|
||||
(type uint32-vector buffer))
|
||||
#+ecl (check-type buffer uint32-vector)
|
||||
(assert (= (array-total-size buffer) (* +n+ 4)))
|
||||
(let ((mask (set-pu32 +msk4+ +msk3+ +msk2+ +msk1+))
|
||||
(r1 (sfmt-aref buffer (- +n+ 2)))
|
||||
(r2 (sfmt-aref buffer (- +n+ 1))))
|
||||
(declare (type int-sse-pack mask r1 r2))
|
||||
(macrolet ((twist (delta)
|
||||
`(psetq r1 r2
|
||||
r2 (setf (sfmt-aref buffer i)
|
||||
(recursion (sfmt-aref buffer i)
|
||||
(sfmt-aref buffer (+ i (the fixnum ,delta)))
|
||||
r1 r2 mask)))))
|
||||
(loop for i fixnum from 0 below (- +n+ +pos1+)
|
||||
do (twist +pos1+))
|
||||
(loop for i fixnum from (- +n+ +pos1+) below +n+
|
||||
do (twist (- +pos1+ +n+))))))
|
||||
|
||||
(defun gen-rand-array (output buffer)
|
||||
(declare (optimize (speed 3) #+ecl (safety 0) (debug 0)
|
||||
#+sbcl (sb-c::insert-array-bounds-checks 0))
|
||||
(type uint32-vector buffer output))
|
||||
#+ecl (check-type buffer uint32-vector)
|
||||
#+ecl (check-type output uint32-vector)
|
||||
(assert (= (array-total-size buffer) (* +n+ 4)))
|
||||
(let ((mask (set-pu32 +msk4+ +msk3+ +msk2+ +msk1+))
|
||||
(size (floor (array-total-size output) 4))
|
||||
(r1 (sfmt-aref buffer (- +n+ 2)))
|
||||
(r2 (sfmt-aref buffer (- +n+ 1))))
|
||||
(declare (type int-sse-pack mask r1 r2)
|
||||
(type fixnum size))
|
||||
(assert (> size (* +n+ 2)))
|
||||
(macrolet ((twist (tgt src1 delta1 src2 delta2)
|
||||
`(psetq r1 r2
|
||||
r2 (setf (sfmt-aref ,tgt i)
|
||||
(recursion (sfmt-aref ,src1 (- i (the fixnum ,delta1)))
|
||||
(sfmt-aref ,src2 (+ i (the fixnum ,delta2)))
|
||||
r1 r2 mask)))))
|
||||
(loop for i fixnum from 0 below (- +n+ +pos1+)
|
||||
do (twist output buffer 0 buffer +pos1+))
|
||||
(loop for i fixnum from (- +n+ +pos1+) below +n+
|
||||
do (twist output buffer 0 output (- +pos1+ +n+)))
|
||||
(loop for i fixnum from +n+ below (- size +n+)
|
||||
do (twist output output +n+ output (- +pos1+ +n+)))
|
||||
#+ ()
|
||||
(loop for j fixnum from 0 below (- (* 2 +n+) size)
|
||||
do (setf (sfmt-aref buffer j)
|
||||
(sfmt-aref output (+ j (the fixnum (- size +n+))))))
|
||||
(loop
|
||||
for i fixnum from (- size +n+) below size
|
||||
for j fixnum from 0 below +n+ ;(max 0 (- (* 2 +n+) size))
|
||||
do (twist output output +n+ output (- +pos1+ +n+))
|
||||
do (setf (sfmt-aref buffer j) r2))
|
||||
output)))
|
||||
|
||||
(defun test ()
|
||||
(let ((out (make-sse-array 10000 :element-type 'uint32)))
|
||||
(init-gen-rand 1234 *work-buffer*)
|
||||
(gen-rand-array out *work-buffer*)
|
||||
(assert (equal (coerce (subseq out 995 1000) 'list)
|
||||
'(2499610950 3057240914 1662679783 461224431 1168395933)))
|
||||
(gen-rand-array out *work-buffer*)
|
||||
(assert (equal (coerce (subseq out 995 1000) 'list)
|
||||
'(648219337 458306832 3674950976 4030368244 2918117049)))))
|
||||
|
||||
(dotimes (i 10)
|
||||
(test))
|
||||
|
||||
Loading…
Add table
Reference in a new issue