Added contributed cl-simd module by Alexander Gavrilov

This commit is contained in:
Juan Jose Garcia Ripoll 2010-10-03 23:49:58 +02:00
parent 0b0c437281
commit 29f46d0387
14 changed files with 3421 additions and 0 deletions

25
contrib/cl-simd/LICENSE Normal file
View file

@ -0,0 +1,25 @@
(This is the MIT / X Consortium license as taken from
http://www.opensource.org/licenses/mit-license.html on or about
Monday; July 13, 2009)
Copyright (c) 2010 by Alexander Gavrilov
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, sublicense, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to
the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

8
contrib/cl-simd/README Normal file
View file

@ -0,0 +1,8 @@
This module implements SSE intrinsic functions for ECL and SBCL.
NOTE: CURRENTLY THIS SHOULD BE CONSIDERED EXPERIMENTAL, AND
SUBJECT TO INCOMPATIBLE CHANGES IN A FUTURE RELEASE.
Since the implementation is closely tied to the internals of
the compiler, it should normally be obtained exclusively via
the bundled contrib mechanism of the above implementations.

View file

@ -0,0 +1,44 @@
;;; -*- mode: Lisp; indent-tabs-mode: nil; -*-
;;;
;;; Copyright (C) 2010, Alexander Gavrilov (angavrilov@gmail.com)
;;;
;;; This file defines the cl-simd ASDF system.
;;;
;;; Note that a completely independent definition
;;; is used to build the system as an ECL contrib.
(defsystem :cl-simd
:version "1.0"
#+sb-building-contrib :pathname
#+sb-building-contrib #p"SYS:CONTRIB;CL-SIMD;"
:components
#+(and sbcl sb-sse-intrinsics)
((:file "sse-package")
(:file "sbcl-core" :depends-on ("sse-package"))
(:file "sse-intrinsics" :depends-on ("sbcl-core"))
(:file "sbcl-functions" :depends-on ("sse-intrinsics"))
(:file "sbcl-arrays" :depends-on ("sbcl-functions"))
(:file "sse-array-defs" :depends-on ("sbcl-arrays"))
(:file "sse-utils" :depends-on ("sse-array-defs")))
#+(and ecl sse2)
((:file "sse-package")
(:file "ecl-sse-core" :depends-on ("sse-package"))
(:file "sse-intrinsics" :depends-on ("ecl-sse-core"))
(:file "sse-array-defs" :depends-on ("sse-intrinsics"))
(:file "ecl-sse-utils" :depends-on ("sse-intrinsics"))
(:file "sse-utils" :depends-on ("ecl-sse-utils")))
#-(or (and sbcl sb-sse-intrinsics)
(and ecl sse2))
())
#+(or (and sbcl sb-sse-intrinsics)
(and ecl sse2))
(defmethod perform :after ((o load-op) (c (eql (find-system :cl-simd))))
(provide :cl-simd))
(defmethod perform ((o test-op) (c (eql (find-system :cl-simd))))
#+(or (and sbcl sb-sse-intrinsics)
(and ecl sse2))
(or (load (compile-file "test-sfmt.lisp"))
(error "test-sfmt failed")))

View file

@ -0,0 +1,250 @@
@node cl-simd
@section cl-simd
@cindex SSE2 Intrinsics
@cindex Intrinsics, SSE2
The @code{cl-simd} module provides access to SSE2 instructions
(which are nowadays supported by any CPU compatible with x86-64)
in the form of @emph{intrinsic functions}, similar to the way
adopted by modern C compilers. It also provides some lisp-specific
functionality, like setf-able intrinsics for accessing lisp arrays.
When this module is loaded, it defines an @code{:sse2} feature,
which can be subsequently used for conditional compilation of
code that depends on it. Intrinsic functions are available from
the @code{sse} package.
This API, with minor technical differences, is supported by both
ECL and SBCL (x86-64 only).
@menu
* SSE pack types::
* SSE array type::
* Differences from C intrinsics::
* Simple extensions::
* Lisp array accessors::
* Example::
@end menu
@node SSE pack types
@subsection SSE pack types
The package defines and/or exports the following types to
represent 128-bit SSE register contents:
@anchor{Type sse:sse-pack}
@deftp {Type} @somepkg{sse-pack,sse} @&optional item-type
The generic SSE pack type.
@end deftp
@anchor{Type sse:int-sse-pack}
@deftp {Type} @somepkg{int-sse-pack,sse}
Same as @code{(sse-pack integer)}.
@end deftp
@anchor{Type sse:float-sse-pack}
@deftp {Type} @somepkg{float-sse-pack,sse}
Same as @code{(sse-pack single-float)}.
@end deftp
@anchor{Type sse:double-sse-pack}
@deftp {Type} @somepkg{double-sse-pack,sse}
Same as @code{(sse-pack double-float)}.
@end deftp
Declaring variable types using the subtype appropriate
for your data is likely to lead to more efficient code
(especially on ECL). However, the compiler implicitly
casts between any subtypes of sse-pack when needed.
Printed representation of SSE packs can be controlled
by binding @code{*sse-pack-print-mode*}:
@anchor{Variable sse:*sse-pack-print-mode*}
@defvr {Variable} @somepkg{@earmuffs{sse-pack-print-mode},sse}
When set to one of @code{:int}, @code{:float} or
@code{:double}, specifies the way SSE packs are
printed. A @code{NIL} value (default) instructs
the implementation to make its best effort to
guess from the data and context.
@end defvr
@node SSE array type
@subsection SSE array type
@anchor{Type sse:sse-array}
@deftp {Type} @somepkg{sse-array,sse} element-type @&optional dimensions
Expands to a lisp array type that is efficiently
supported by AREF-like accessors.
It should be assumed to be a subtype of @code{SIMPLE-ARRAY}.
The type expander signals warnings or errors if it detects
that the element-type argument value is inappropriate or unsafe.
@end deftp
@anchor{Function sse:make-sse-array}
@deffn {Function} @somepkg{make-sse-array,sse} dimensions @&key element-type initial-element displaced-to displaced-index-offset
Creates an object of type @code{sse-array}, or signals an error.
In non-displaced case ensures alignment of the beginning of data to
the 16-byte boundary.
Unlike @code{make-array}, the element type defaults to (unsigned-byte 8).
@end deffn
On ECL this function supports full-featured displacement.
On SBCL it has to simulate it by sharing the underlying
data vector, and does not support nonzero index offset.
@node Differences from C intrinsics
@subsection Differences from C intrinsics
Intel Compiler, GCC and
@url{http://msdn.microsoft.com/en-us/library/y0dh78ez%28VS.80%29.aspx,MSVC}
all support the same set
of SSE intrinsics, originally designed by Intel. This
package generally follows the naming scheme of the C
version, with the following exceptions:
@itemize
@item
Underscores are replaced with dashes, and the @code{_mm_}
prefix is removed in favor of packages.
@item
The 'e' from @code{epi} is dropped because MMX is obsolete
and won't be supported.
@item
@code{_si128} functions are renamed to @code{-pi} for uniformity
and brevity. The author has personally found this discrepancy
in the original C intrinsics naming highly jarring.
@item
Comparisons are named using graphic characters, e.g. @code{<=-ps}
for @code{cmpleps}, or @code{/>-ps} for @code{cmpngtps}. In some
places the set of comparison functions is extended to cover the
full possible range.
@item
Conversion functions are renamed to @code{convert-*-to-*} and
@code{truncate-*-to-*}.
@item
A few functions are completely renamed: @code{cpu-mxcsr} (setf-able),
@code{cpu-pause}, @code{cpu-load-fence}, @code{cpu-store-fence},
@code{cpu-memory-fence}, @code{cpu-clflush}, @code{cpu-prefetch-*}.
@end itemize
In addition, foreign pointer access intrinsics have an additional
optional integer offset parameter to allow more efficient coding
of pointer deference, and the most common ones have been renamed
and made SETF-able:
@itemize
@item
@code{mem-ref-ss}, @code{mem-ref-ps}, @code{mem-ref-aps}
@item
@code{mem-ref-sd}, @code{mem-ref-pd}, @code{mem-ref-apd}
@item
@code{mem-ref-pi}, @code{mem-ref-api}, @code{mem-ref-si64}
@end itemize
(The @code{-ap*} version requires alignment.)
@node Simple extensions
@subsection Simple extensions
This module extends the set of basic intrinsics with the following
simple compound functions:
@itemize
@item
@code{neg-ss}, @code{neg-ps}, @code{neg-sd}, @code{neg-pd},
@code{neg-pi8}, @code{neg-pi16}, @code{neg-pi32}, @code{neg-pi64}:
implement numeric negation of the corresponding data type.
@item
@code{not-ps}, @code{not-pd}, @code{not-pi}:
implement bitwise logical inversion.
@item
@code{if-ps}, @code{if-pd}, @code{if-pi}:
perform element-wise combining of two values based on a boolean
condition vector produced as a combination of comparison function
results through bitwise logical functions.
The condition value must use all-zero bitmask for false, and
all-one bitmask for true as a value for each logical vector
element. The result is undefined if any other bit pattern is used.
N.B.: these are @emph{functions}, so both branches of the
conditional are always evaluated.
@end itemize
The module also provides symbol macros that expand into expressions
producing certain constants in the most efficient way:
@itemize
@item
0.0-ps 0.0-pd 0-pi for zero
@item
true-ps true-pd true-pi for all 1 bitmask
@item
false-ps false-pd false-pi for all 0 bitmask (same as zero)
@end itemize
@node Lisp array accessors
@subsection Lisp array accessors
In order to provide better integration with ordinary lisp code,
this module implements a set of AREF-like memory accessors:
@itemize
@item
@code{(ROW-MAJOR-)?AREF-PREFETCH-(T0|T1|T2|NTA)} for cache prefetch.
@item
@code{(ROW-MAJOR-)?AREF-CLFLUSH} for cache flush.
@item
@code{(ROW-MAJOR-)?AREF-[AS]?P[SDI]} for whole-pack read & write.
@end itemize
(Where A = aligned; S = aligned streamed write.)
These accessors can be used with any non-bit specialized
array or vector, without restriction on the precise element
type (although it should be declared at compile time to
ensure generation of the fastest code).
Additional index bound checking is done to ensure that 16
bytes of memory are accessible after the specified index.
As an exception, ROW-MAJOR-AREF-PREFETCH-* does not do any
range checks at all, because the prefetch instructions
are officially safe to use with bad addresses. The
AREF-PREFETCH-* and *-CLFLUSH functions do only ordinary
index checks without the 16-byte extension.
@node Example
@subsection Example
This code processes several single-float arrays, storing
either the value of a*b, or c/3.5 into result, depending
on the sign of mode:
@example
(loop for i from 0 below 128 by 4
do (setf (aref-ps result i)
(if-ps (<-ps (aref-ps mode i) 0.0-ps)
(mul-ps (aref-ps a i) (aref-ps b i))
(div-ps (aref-ps c i) (set1-ps 3.5)))))
@end example
As already noted above, both branches of the if are always
evaluated.

View file

@ -0,0 +1,310 @@
;;; -*- mode: Lisp; indent-tabs-mode: nil; -*-
;;;
;;; Copyright (c) 2010, Alexander Gavrilov (angavrilov@gmail.com)
;;;
;;; This file defines macros for wrapping C-level SSE intrinsics.
;;;
(in-package #:SSE)
;;; The compound SSE pack type
(deftype sse-pack (&optional item)
(ecase item
(* 'ext:sse-pack)
((single-float float) 'float-sse-pack)
(double-float 'double-sse-pack)
(integer 'int-sse-pack)))
;;; Helper macros and functions
(defmacro typename-case (value &body clauses)
"Syntax: (case value &body clauses)"
`(cond ,@(mapcar (lambda (clause)
`((subtypep ,value ',(first clause))
,@(rest clause)))
clauses)
(t (error "Unsupported type name: ~S" ,value))))
(defun foreign-type-of (lt)
(typename-case lt
(nil :object)
(int-sse-pack :int-sse-pack)
(float-sse-pack :float-sse-pack)
(double-sse-pack :double-sse-pack)
(single-float :float)
(double-float :double)
(fixnum :fixnum)
#+uint32-t
(ext:integer32 :int32-t)
#+uint32-t
(ext:byte32 :uint32-t)
#+uint64-t
(ext:integer64 :int64-t)
#+uint64-t
(ext:byte64 :uint64-t)
(integer :fixnum)))
(defun pointer-c-type-of (lt)
(typename-case lt
(nil "void")
(int-sse-pack "__m128i")
(float-sse-pack "float")
(double-sse-pack "double")))
;; Accept any real values for floating-point arguments:
(defun declaim-arg-type-of (lt)
(typename-case lt
((or single-float double-float) 'real)
(ext:sse-pack 'ext:sse-pack)
(fixnum 'fixnum)
(t lt)))
(defun inline-arg-type-of (lt)
(typename-case lt
((or single-float double-float) 'c::fixnum-float)
(fixnum 'fixnum)
(t lt)))
;; Constant expansion
(defun expand-constant (form env &optional chgp)
(let* ((mform (macroexpand form env))
(cform (cond ((and (symbolp mform) (constantp mform))
(symbol-value mform))
(t mform))))
(values cform (or chgp (not (eql cform form))))))
;; Macro helpers
(defun make-arg-name (index)
(intern (format nil "ARG~A" index)))
(defun make-arg-nums (lst)
(loop for i from 0 below (length lst) collect i))
(defun wrap-ret-arg (core ret-type &optional ret-arg)
(cond ((eq ret-type nil)
(format nil "(~A,Cnil)" core))
(ret-arg
(format nil "@~36R;(~A,#~36R)" ret-arg core ret-arg))
(t core)))
;; Constant generation
(defun make-pack-of-bin (bin-value &key (as 'int-sse-pack))
(let* ((all (loop for i from 0 to 15
for v = bin-value then (ash v -8)
collect (logand v 255)))
(pack (ext:vector-to-sse-pack
(make-array 16 :element-type '(unsigned-byte 8) :initial-contents all))))
(if (eq as 'int-sse-pack)
pack
`(the ,as ,(ext:sse-pack-as-elt-type
pack (ecase as
(EXT:FLOAT-SSE-PACK 'single-float)
(EXT:DOUBLE-SSE-PACK 'double-float)))))))
(defmacro def-inline (name mode arg-types ret-type call-str &rest flags)
`(eval-when (:compile-toplevel :load-toplevel)
(c::def-inline ',name ',mode ',arg-types ',ret-type ,call-str ,@flags)))
(defmacro def-intrinsic (name arg-types ret-type c-name
&key (export t) ret-arg reorder-args immediate-args)
"Defines and exports an SSE intrinsic function with matching open-coding rules."
(let* ((anums (make-arg-nums arg-types))
(asyms (mapcar #'make-arg-name anums))
(aftypes (mapcar #'foreign-type-of arg-types))
(rftype (foreign-type-of ret-type))
(call-anums (if reorder-args (reverse anums) anums))
(call-str (wrap-ret-arg (format nil "~A(~{#~36R~^,~})" c-name call-anums) ret-type ret-arg)))
`(progn
,(if export `(export ',name))
,@(if immediate-args ; Generate a constantness verifier macro
`((define-compiler-macro ,name (&whole whole &environment env ,@asyms &aux chgp)
,@(loop for (arg type) in immediate-args
collect `(let ((oldv ,arg))
(multiple-value-setq (,arg chgp) (expand-constant oldv env chgp))
(unless (typep ,arg ',type)
(c::cmperr "In call to ~A: Argument ~S~@[ = ~S~] is not a constant of type ~A"
',name oldv (unless (eql oldv ,arg) ,arg) ',type))))
(if chgp (list ',name ,@asyms) whole))))
(proclaim '(ftype (function ,(mapcar #'declaim-arg-type-of arg-types) ,(or ret-type 'null)) ,name))
,@(if (null immediate-args)
`((defun ,name ,asyms
(declare (optimize (speed 0) (debug 0) (safety 1)))
(ffi:c-inline ,asyms ,aftypes ,rftype ,call-str :one-liner t))))
(def-inline ,name :always ,(mapcar #'inline-arg-type-of arg-types) ,rftype
,call-str :inline-or-warn t))))
(defmacro def-unary-intrinsic (name ret-type insn cost c-name
&key (arg-type ret-type) partial result-size immediate-arg)
(declare (ignore insn cost partial result-size))
`(def-intrinsic ,name (,arg-type ,@(if immediate-arg (list immediate-arg)))
,ret-type ,c-name :immediate-args ,(if immediate-arg `((arg1 ,immediate-arg)))))
(defmacro def-cvt-to-int32-intrinsic (name ret-type insn cost c-name
&key (arg-type ret-type) partial immediate-arg)
(declare (ignore insn cost partial))
(assert (subtypep ret-type '(signed-byte 32)))
`(def-intrinsic ,name (,arg-type ,@(if immediate-arg (list immediate-arg)))
,ret-type ,c-name :immediate-args ,(if immediate-arg `((arg1 ,immediate-arg)))))
(defmacro def-binary-intrinsic (name ret-type insn cost c-name
&key (x-type ret-type) (y-type ret-type)
commutative tags immediate-arg)
(declare (ignore insn cost commutative tags))
`(def-intrinsic ,name (,x-type ,y-type ,@(if immediate-arg (list immediate-arg)))
,ret-type ,c-name :immediate-args ,(if immediate-arg `((arg2 ,immediate-arg)))))
(defmacro def-sse-int-intrinsic (name int-type ret-type insn cost c-name
&key (arg-type ret-type) immediate-arg make-temporary)
(declare (ignore insn cost make-temporary))
`(def-intrinsic ,name (,arg-type ,int-type ,@(if immediate-arg (list immediate-arg)))
,ret-type ,c-name :immediate-args ,(if immediate-arg `((arg2 ,immediate-arg)))))
(defmacro %def-aref-intrinsic (tag val-type c-type reader writer &key (aux-args "") (bsize 16))
"Defines and exports macros and functios that implement vectorized array access."
(let* ((rftype (foreign-type-of val-type))
(aref-name (intern (format nil "AREF-~A" tag) *package*))
(rm-aref-name (intern (format nil "ROW-MAJOR-AREF-~A" tag) *package*))
(rm-aset-name (intern (format nil "ROW-MAJOR-ASET-~A" tag) *package*))
(known-elt-types '((single-float "sf")
(double-float "df")
(ext:byte8 "b8")
(ext:integer8 "i8")
#+uint16-t (ext:byte16 "b16")
#+uint16-t (ext:integer16 "i16")
#+uint32-t (ext:byte32 "b32")
#+uint32-t (ext:integer32 "i32")
#+uint64-t (ext:byte64 "b64")
#+uint64-t (ext:integer64 "i64"))))
(flet ((fmtr (ptr-fmt &rest ptr-args)
(wrap-ret-arg (format nil "~A((~A*)~?~A)"
reader c-type ptr-fmt ptr-args aux-args)
val-type))
(fmtw (ptr-fmt &rest ptr-args)
(wrap-ret-arg (format nil "~A((~A*)~?,#2)"
writer c-type ptr-fmt ptr-args)
val-type 2)))
`(progn
(export ',aref-name)
(export ',rm-aref-name)
(defmacro ,aref-name (array &rest indexes)
(let ((varr (gensym "ARR")))
`(let ((,varr ,array))
(declare (:read-only ,varr))
(,',rm-aref-name ,varr (array-row-major-index ,varr ,@indexes)))))
(proclaim '(ftype (function (array fixnum) ,(or val-type 'null)) ,rm-aref-name))
(defun ,rm-aref-name (array index)
(declare (optimize (speed 0) (debug 0) (safety 2)))
(ffi:c-inline (array index) (:object :int) ,rftype
,(fmtr "ecl_row_major_ptr(#0,#1,~A)" bsize)
:one-liner t))
;; AREF
(def-inline ,rm-aref-name :always (t t) ,rftype
,(fmtr "ecl_row_major_ptr(#0,fixint(#1),~A)" bsize)
:inline-or-warn t)
(def-inline ,rm-aref-name :always (t fixnum) ,rftype
,(fmtr "ecl_row_major_ptr(#0,#1,~A)" bsize))
;; AREF unsafe
,@(mapcar (lambda (spec)
`(def-inline ,rm-aref-name :unsafe ((array ,(first spec)) fixnum) ,rftype
,(fmtr "(&(#0)->array.self.~A[#1])" (second spec))))
known-elt-types)
,@(if writer
`((define-setf-expander ,aref-name (array &rest indexes)
(let ((varr (gensym)) (vidx (gensym)) (vval (gensym)))
(values (list varr vidx)
(list array `(array-row-major-index ,varr ,@indexes))
(list vval)
`(,',rm-aset-name ,varr ,vidx ,vval) `(,',rm-aref-name ,varr ,vidx))))
(proclaim '(ftype (function (array fixnum ,(declaim-arg-type-of val-type)) ,val-type) ,rm-aset-name))
(defun ,rm-aset-name (array index value)
(declare (optimize (speed 0) (debug 0) (safety 2)))
(prog1 value
(ffi:c-inline (array index value) (:object :int ,rftype) :void
,(fmtw "ecl_row_major_ptr(#0,#1,~A)" bsize)
:one-liner t)))
(defsetf ,rm-aref-name ,rm-aset-name)
;; ASET
(def-inline ,rm-aset-name :always (t t ,val-type) ,rftype
,(fmtw "ecl_row_major_ptr(#0,fixint(#1),~A)" bsize)
:inline-or-warn t)
(def-inline ,rm-aset-name :always (t fixnum ,val-type) ,rftype
,(fmtw "ecl_row_major_ptr(#0,#1,~A)" bsize))
;; ASET unsafe
,@(mapcar (lambda (spec)
`(def-inline ,rm-aset-name :unsafe ((array ,(first spec)) fixnum ,val-type) ,rftype
,(fmtw "(&(#0)->array.self.~A[#1])" (second spec))))
known-elt-types)))))))
(defmacro def-aref-intrinsic (tag val-type reader-fun writer-fun &key (check-bounds t))
`(%def-aref-intrinsic ,tag ,val-type ,(pointer-c-type-of val-type)
,(get reader-fun 'c-function-name) ,(get writer-fun 'c-function-name)
:bsize ,(ecase check-bounds
(t 16) ((nil) 0) (:no-gap 1))
:aux-args ,(get reader-fun 'c-call-aux-args)))
(defmacro def-mem-intrinsic (name c-type ret-type c-name &key (public t)
prefix-args (prefix-fmt "~@{#~36R,~}")
postfix-args (postfix-fmt "~@{,#~36R~}" pf-p) ret-arg)
"Defines and exports an SSE memory access intrinsic function with matching open-coding rules."
(let* ((anums (make-arg-nums (append prefix-args postfix-args)))
(asyms (mapcar #'make-arg-name anums))
(prefix-nums (subseq anums 0 (length prefix-args)))
(postfix-nums (mapcar #'1+ (subseq anums (length prefix-args))))
(prefix-syms (subseq asyms 0 (length prefix-args)))
(postfix-syms (subseq asyms (length prefix-args)))
(prefix-itypes (mapcar #'inline-arg-type-of prefix-args))
(postfix-itypes (mapcar #'inline-arg-type-of postfix-args))
(rftype (foreign-type-of ret-type))
(ptr-idx (length prefix-args))
(offset-idx (+ ptr-idx 1 (length postfix-args))))
(flet ((fmt (ptr-text)
(wrap-ret-arg (format nil "~A(~?(~A*)~?~?)"
c-name prefix-fmt prefix-nums
c-type ptr-text (list ptr-idx offset-idx)
postfix-fmt postfix-nums)
ret-type ret-arg)))
`(progn
,(when public `(export ',name))
(eval-when (:compile-toplevel :load-toplevel :execute)
(setf (get ',name 'c-function-name) ,c-name)
,(if (and pf-p (null postfix-args))
`(setf (get ',name 'c-call-aux-args) ,postfix-fmt)))
(proclaim '(ftype (function (,@(mapcar #'declaim-arg-type-of prefix-args) si:foreign-data
,@(mapcar #'declaim-arg-type-of postfix-args) &optional fixnum) ,ret-type) ,name))
(defun ,name (,@prefix-syms ptr ,@postfix-syms &optional (offset 0))
(declare (optimize (speed 0) (debug 0) (safety 1)))
(ffi:c-inline (,@prefix-syms ptr ,@postfix-syms offset)
(,@(mapcar #'foreign-type-of prefix-args) :pointer-void
,@(mapcar #'foreign-type-of postfix-args) :int) ,rftype
,(fmt "(((char*)#~A) + #~A)")
:one-liner t))
(def-inline ,name :always (,@prefix-itypes t ,@postfix-itypes) ,rftype
,(fmt "ecl_to_pointer(#~A)")
:inline-or-warn t)
(def-inline ,name :always (,@prefix-itypes t ,@postfix-itypes t) ,rftype
,(fmt "(((char*)ecl_to_pointer(#~A)) + fixint(#~A))"))
(def-inline ,name :always (,@prefix-itypes t ,@postfix-itypes fixnum) ,rftype
,(fmt "(((char*)ecl_to_pointer(#~A)) + #~A)"))
(def-inline ,name :unsafe (,@prefix-itypes si:foreign-data ,@postfix-itypes) ,rftype
,(fmt "(#~A)->foreign.data"))
(def-inline ,name :unsafe (,@prefix-itypes si:foreign-data ,@postfix-itypes t) ,rftype
,(fmt "(((char*)(#~A)->foreign.data) + fix(#~A))"))
(def-inline ,name :unsafe (,@prefix-itypes si:foreign-data ,@postfix-itypes fixnum) ,rftype
,(fmt "(((char*)(#~A)->foreign.data) + #~A)"))))))
(defmacro def-load-intrinsic (name ret-type insn c-name &key register-arg tags size postfix-fmt)
(declare (ignore insn tags size))
`(def-mem-intrinsic ,name ,(pointer-c-type-of ret-type) ,ret-type ,c-name
:prefix-args ,(if register-arg (list ret-type))
:postfix-fmt ,(or postfix-fmt "")))
(defmacro def-store-intrinsic (name ret-type insn c-name &key setf-name)
(declare (ignore insn))
`(progn
(def-mem-intrinsic ,name ,(pointer-c-type-of ret-type) ,ret-type ,c-name
:public ,(not setf-name) :postfix-args (,ret-type) :ret-arg 1)
,(if setf-name
`(defsetf ,setf-name (pointer &optional (offset 0)) (value)
`(,',name ,pointer ,value ,offset)))))

View file

@ -0,0 +1,398 @@
;;; -*- mode: Lisp; indent-tabs-mode: nil; -*-
;;;
;;; Copyright (c) 2010, Alexander Gavrilov (angavrilov@gmail.com)
;;;
;;; This file defines some extensions to the base intrinsic set,
;;; and other utility functions.
;;;
(in-package #:SSE)
;;; Helper macros and functions
(eval-when (:compile-toplevel :load-toplevel :execute)
;; Try using a matching inverse function name
(defun lookup-flip (arg pairs &key no-reverse)
(and (consp arg)
(let ((fix (or (cdr (assoc (first arg) pairs))
(unless no-reverse
(car (rassoc (first arg) pairs))))))
(cond ((eq fix :identity)
(assert (null (cddr arg)))
(second arg))
(fix
`(,fix ,@(rest arg)))
(t nil)))))
;; Macroexpand, plus compiler expand some specific names
(defun expand-condition (form env)
(setq form (macroexpand form env))
(loop while (and (consp form)
(symbolp (first form))
(get (first form) 'expand-in-condition))
do (setq form (c::cmp-expand-macro (compiler-macro-function (first form))
form env)))
form)
;; Checks if the form is an unary call
(defun is-unary? (form op)
(and (consp form)
(eq (first form) op)
(null (cddr form))))
;; IF-style function expander
(defun expand-if-macro (condition then-value else-value env if-f not-f or-f and-f andnot-f type-name zero-val &key flip)
(let* ((condition (expand-condition condition env))
(then-value (macroexpand then-value env))
(else-value (macroexpand else-value env))
(then-zero? (equal then-value zero-val))
(else-zero? (equal else-value zero-val)))
(cond ((is-unary? condition not-f)
(expand-if-macro (second condition) else-value then-value
env if-f not-f or-f and-f andnot-f type-name zero-val
:flip (not flip)))
((and then-zero? else-zero?)
zero-val)
(then-zero?
`(,andnot-f ,condition ,else-value))
(else-zero?
`(,and-f ,condition ,then-value))
(t
(let* ((csym (gensym))
(args `((,and-f ,csym ,then-value)
(,andnot-f ,csym ,else-value))))
`(let ((,csym ,condition))
(declare (type ,type-name ,csym)
(:read-only ,csym))
(,or-f ,@(if flip (reverse args) args)))))))))
(defmacro def-utility (name arg-types ret-type expansion &key expand-args expand-in-condition)
"Defines and exports a function & compiler macro with the specified expansion."
(let* ((anames (mapcar #'make-arg-name (make-arg-nums arg-types))))
`(progn
(export ',name)
(eval-when (:compile-toplevel :load-toplevel)
,@(if expand-in-condition
`((setf (get ',name 'expand-in-condition) t)))
(define-compiler-macro ,name (&environment env ,@anames)
(declare (ignorable env))
,@(loop for arg in (if (eq expand-args t) anames expand-args)
collect `(setq ,arg (macroexpand ,arg env)))
,expansion))
(proclaim '(ftype (function ,(mapcar #'declaim-arg-type-of arg-types) ,ret-type) ,name))
(defun ,name ,anames
(declare (optimize (speed 0) (debug 0) (safety 1)))
(let ,(mapcar #'list anames anames)
(declare ,@(loop for an in anames and at in arg-types
collect `(type ,at ,an)))
;; Depends on the compiler macro being expanded:
(,name ,@anames))))))
(defmacro def-if-function (name type-name postfix)
`(def-utility ,name (,type-name ,type-name ,type-name) ,type-name
(expand-if-macro arg0 arg1 arg2 env
',name
',(intern (format nil "NOT-~A" postfix))
',(intern (format nil "OR-~A" postfix))
',(intern (format nil "AND-~A" postfix))
',(intern (format nil "ANDNOT-~A" postfix))
',type-name
'(,(intern (format nil "SETZERO-~A" postfix))))))
;;; Aligned array allocation
(deftype sse-array (elt-type &optional dims)
"Type of arrays efficiently accessed by SSE aref intrinsics and returned by make-sse-array.
Should be assumed to be SIMPLE-ARRAY, except that displacing with MAKE-SSE-ARRAY is allowed."
(when (eq elt-type '*)
(c::cmperr "SSE-ARRAY must have a specific element type."))
(let ((upgraded (upgraded-array-element-type elt-type)))
(when (member upgraded '(t bit))
(c::cmperr "Invalid SSE-ARRAY element type: ~S" elt-type))
(unless (subtypep upgraded elt-type)
(c::cmpwarn "SSE-ARRAY element type ~S has been upgraded to ~S" elt-type upgraded))
`(array ,upgraded ,dims)))
(defun make-sse-array (dimensions &rest args &key (element-type '(unsigned-byte 8)) displaced-to &allow-other-keys)
"Allocates an SSE-ARRAY aligned to the 16-byte boundary. May flatten displacement chains for performance reasons."
(if displaced-to
(apply #'make-array dimensions args)
(multiple-value-bind (elt-size adj-type)
(array-element-type-byte-size element-type)
(when (eq adj-type t)
(error "Cannot use element type T with SSE."))
(sys::remf args :element-type)
(let* ((full-size (if (numberp dimensions)
dimensions
(reduce #'* dimensions)))
(padded-size (+ full-size (ceiling 15 elt-size)))
(array (apply #'make-array padded-size :element-type adj-type args))
(misalign (ffi:c-inline (array) (:object) :int
"(((unsigned long)(#0)->array.self.b8) & 15)"
:one-liner t))
(offset (/ (if (> misalign 0) (- 16 misalign) 0) elt-size)))
(make-array dimensions :element-type element-type
:displaced-to array :displaced-index-offset offset)))))
;;; Single-float tools
;; Constants
(defmacro set-true-ss ()
(load-time-value (make-pack-of-bin #xFFFFFFFF :as 'float-sse-pack)))
(defmacro set-true-ps ()
(load-time-value (make-pack-of-bin -1 :as 'float-sse-pack)))
(eval-when (:compile-toplevel :load-toplevel)
(define-symbol-macro 0.0-ps (setzero-ps))
(define-symbol-macro true-ss (set-true-ss))
(define-symbol-macro false-ss (setzero-ps))
(define-symbol-macro true-ps (set-true-ps))
(define-symbol-macro false-ps (setzero-ps)))
;; Bitwise if
(def-if-function if-ps float-sse-pack #:ps)
;; Arithmetic negation (xor with negative zero)
(def-utility neg-ss (float-sse-pack) float-sse-pack
`(xor-ps ,arg0 ,(load-time-value (make-pack-of-bin #x80000000 :as 'float-sse-pack))))
(def-utility neg-ps (float-sse-pack) float-sse-pack
`(xor-ps ,arg0 ,(load-time-value
(make-pack-of-bin #x80000000800000008000000080000000 :as 'float-sse-pack))))
;; Logical inversion
(def-utility not-ps (float-sse-pack) float-sse-pack
(or (lookup-flip arg0 '((=-ps . /=-ps)
(<-ps . /<-ps)
(<=-ps . /<=-ps)
(>-ps . />-ps)
(>=-ps . />=-ps)
(cmpord-ps . cmpunord-ps)
(not-ps . :identity)))
`(xor-ps ,arg0 true-ps))
:expand-args t)
;; Shuffle
(defun shuffle-ps (x y mask)
(declare (optimize (speed 0) (debug 0) (safety 1))
(type t x y mask))
(check-type x sse-pack)
(check-type y sse-pack)
(check-type mask (unsigned-byte 8))
(ffi:c-inline (x y mask) (:object :object :int) :float-sse-pack
"_mm_setr_ps(
(#0)->sse.data.sf[(#2)&3],
(#0)->sse.data.sf[((#2)>>2)&3],
(#1)->sse.data.sf[((#2)>>4)&3],
(#1)->sse.data.sf[((#2)>>6)&3]
)" :one-liner t))
;;; Double-float tools
;; Constants
(defmacro set-true-sd ()
(load-time-value (make-pack-of-bin #xFFFFFFFFFFFFFFFF :as 'double-sse-pack)))
(defmacro set-true-pd ()
(load-time-value (make-pack-of-bin -1 :as 'double-sse-pack)))
(eval-when (:compile-toplevel :load-toplevel)
(define-symbol-macro 0.0-pd (setzero-pd))
(define-symbol-macro true-sd (set-true-sd))
(define-symbol-macro false-sd (setzero-pd))
(define-symbol-macro true-pd (set-true-pd))
(define-symbol-macro false-pd (setzero-pd)))
;; Bitwise if
(def-if-function if-pd double-sse-pack #:pd)
;; Arithmetic negation (xor with negative zero)
(def-utility neg-sd (double-sse-pack) double-sse-pack
`(xor-pd ,arg0
,(load-time-value
(make-pack-of-bin #x8000000000000000 :as 'double-sse-pack))))
(def-utility neg-pd (double-sse-pack) double-sse-pack
`(xor-pd ,arg0
,(load-time-value
(make-pack-of-bin #x80000000000000008000000000000000 :as 'double-sse-pack))))
;; Logical inversion
(def-utility not-pd (double-sse-pack) double-sse-pack
(or (lookup-flip arg0 '((=-pd . /=-pd)
(<-pd . /<-pd)
(<=-pd . /<=-pd)
(>-pd . />-pd)
(>=-pd . />=-pd)
(cmpord-pd . cmpunord-pd)
(not-pd . :identity)))
`(xor-pd ,arg0 true-pd))
:expand-args t)
;; Shuffle
(defun shuffle-pd (x y mask)
(declare (optimize (speed 0) (debug 0) (safety 1))
(type t x y mask))
(check-type x sse-pack)
(check-type y sse-pack)
(check-type mask (unsigned-byte 2))
(ffi:c-inline (x y mask) (:object :object :int) :double-sse-pack
"_mm_setr_pd(
(#0)->sse.data.df[(#2)&1],
(#1)->sse.data.df[((#2)>>1)&1]
)" :one-liner t))
;;; Integer tools
;; Constants
(defmacro set-true-pi ()
(load-time-value (make-pack-of-bin -1 :as 'int-sse-pack)))
(eval-when (:compile-toplevel :load-toplevel)
(define-symbol-macro 0-pi (setzero-pi))
(define-symbol-macro true-pi (set-true-pi))
(define-symbol-macro false-pi (setzero-pi)))
;; Bitwise if
(def-if-function if-pi float-sse-pack #:pi)
;; Arithmetic negation (subtract from 0)
(macrolet ((frob (name subf)
`(def-utility ,name (int-sse-pack) int-sse-pack
`(,',subf (setzero-pi) ,arg0))))
(frob neg-pi8 sub-pi8)
(frob neg-pi16 sub-pi16)
(frob neg-pi32 sub-pi32)
(frob neg-pi64 sub-pi64))
;; Logical inversion
(def-utility not-pi (int-sse-pack) int-sse-pack
(or (lookup-flip arg0 '((<=-pi8 . >-pi8)
(<=-pi16 . >-pi16)
(<=-pi32 . >-pi32)
(>=-pi8 . <-pi8)
(>=-pi16 . <-pi16)
(>=-pi32 . <-pi32)
(/=-pi8 . =-pi8)
(/=-pi16 . =-pi16)
(/=-pi32 . =-pi32)
(not-pi . :identity))
:no-reverse t)
`(xor-pi ,arg0 true-pi))
:expand-args t)
(macrolet ((frob (name code)
`(def-utility ,name (int-sse-pack int-sse-pack) int-sse-pack
,code
:expand-in-condition t)))
(frob <=-pi8 `(not-pi (>-pi8 ,arg0 ,arg1)))
(frob <=-pi16 `(not-pi (>-pi16 ,arg0 ,arg1)))
(frob <=-pi32 `(not-pi (>-pi32 ,arg0 ,arg1)))
(frob >=-pi8 `(not-pi (<-pi8 ,arg0 ,arg1)))
(frob >=-pi16 `(not-pi (<-pi16 ,arg0 ,arg1)))
(frob >=-pi32 `(not-pi (<-pi32 ,arg0 ,arg1)))
(frob /=-pi8 `(not-pi (=-pi8 ,arg0 ,arg1)))
(frob /=-pi16 `(not-pi (=-pi16 ,arg0 ,arg1)))
(frob /=-pi32 `(not-pi (=-pi32 ,arg0 ,arg1))))
;; Shifts
(defun slli-pi (x shift)
(declare (optimize (speed 0) (debug 0) (safety 1))
(type t x shift))
(check-type x sse-pack)
(check-type shift (unsigned-byte 8))
(ffi:c-inline (x shift) (:object :int) :object
"cl_object rv = ecl_make_int_sse_pack(_mm_setzero_si128());
unsigned bshift=(#1), i;
for (i = 0; i + bshift < 16; i++)
rv->sse.data.b8[i+bshift] = (#0)->sse.data.b8[i];
@(return) = rv;"))
(defun srli-pi (x shift)
(declare (optimize (speed 0) (debug 0) (safety 1))
(type t x shift))
(check-type x sse-pack)
(check-type shift (unsigned-byte 8))
(ffi:c-inline (x shift) (:object :int) :object
"cl_object rv = ecl_make_int_sse_pack(_mm_setzero_si128());
int bshift=(#1), i;
for (i = 16 - bshift - 1; i >= 0; i--)
rv->sse.data.b8[i] = (#0)->sse.data.b8[i+bshift];
@(return) = rv;"))
;; Extract & insert
(defun extract-pi16 (x index)
(declare (optimize (speed 0) (debug 0) (safety 1))
(type t x index))
(check-type x sse-pack)
(check-type index (unsigned-byte 8))
(ffi:c-inline (x index) (:object :int) :fixnum
"*((unsigned short*)&(#0)->sse.data.b8[((#1)&3)*2])"
:one-liner t))
(defun insert-pi16 (x ival index)
(declare (optimize (speed 0) (debug 0) (safety 1))
(type t x ival index))
(check-type x sse-pack)
(check-type index (unsigned-byte 8))
(ffi:c-inline (x ival index) (:int-sse-pack :int :int) :object
"cl_object rv = ecl_make_int_sse_pack(#0);
*((unsigned short*)&rv->sse.data.b8[((#2)&3)*2]) = (unsigned short)(#1);
@(return) = rv;"))
;; Shuffles
(defun shuffle-pi32 (x mask)
(declare (optimize (speed 0) (debug 0) (safety 1))
(type t x mask))
(check-type x sse-pack)
(check-type mask (unsigned-byte 8))
(ffi:c-inline (x mask) (:object :int) :int-sse-pack
"unsigned *pd = (unsigned*)(#0)->sse.data.b8;
@(return) = _mm_setr_epi32(pd[(#1)&3],pd[((#1)>>2)&3],pd[((#1)>>4)&3],pd[((#1)>>6)&3]);"))
(defun shufflelo-pi16 (x mask)
(declare (optimize (speed 0) (debug 0) (safety 1))
(type t x mask))
(check-type x sse-pack)
(check-type mask (unsigned-byte 8))
(ffi:c-inline (x mask) (:object :int) :int-sse-pack
"unsigned short *pd = (unsigned short*)(#0)->sse.data.b8;
@(return) = _mm_setr_epi16(
pd[(#1)&3],pd[((#1)>>2)&3],pd[((#1)>>4)&3],pd[(((#1)>>6)&3)],
pd[4], pd[5], pd[6], pd[7]
);"))
(defun shufflehi-pi16 (x mask)
(declare (optimize (speed 0) (debug 0) (safety 1))
(type t x mask))
(check-type x sse-pack)
(check-type mask (unsigned-byte 8))
(ffi:c-inline (x mask) (:object :int) :int-sse-pack
"unsigned short *pb = (unsigned short*)(#0)->sse.data.b8, *pd = pb+4;
@(return) = _mm_setr_epi16(
pb[0], pb[1], pb[2], pb[3],
pd[(#1)&3],pd[((#1)>>2)&3],pd[((#1)>>4)&3],pd[(((#1)>>6)&3)]
);"))

View file

@ -0,0 +1,280 @@
;;; -*- mode: Lisp; indent-tabs-mode: nil; -*-
;;;
;;; Copyright (c) 2010, Alexander Gavrilov (angavrilov@gmail.com)
;;;
;;; This file contains the groundwork for vectorized
;;; array access intrinsics.
;;;
(in-package #:SSE)
;; SSE array element size calculation
(eval-when (:compile-toplevel :load-toplevel :execute)
(defun sse-elt-shift-from-saetp (info)
(and info
(subtypep (saetp-specifier info) 'number)
(not (saetp-fixnum-p info))
(case (saetp-n-bits info)
(8 0) (16 1) (32 2) (64 3) (128 4)))))
(defglobal %%size-shift-table%%
(let ((arr (make-array (1+ widetag-mask) :initial-element nil)))
(loop
for info across *specialized-array-element-type-properties*
for shift = (sse-elt-shift-from-saetp info)
when shift
do (setf (svref arr (saetp-typecode info)) shift))
arr)
"A table of element size shifts for supported SSE array types.")
(declaim (inline sse-elt-shift-of)
(ftype (function (t) (integer 0 4)) sse-elt-shift-of))
(defun sse-elt-shift-of (obj)
"Returns the SSE element size shift for the given object,
or fails if it is not a valid SSE vector."
(declare (optimize (safety 0)))
(the (integer 0 4)
(or (svref %%size-shift-table%%
(if (sb-vm::%other-pointer-p obj)
(%other-pointer-widetag obj)
0))
(error 'type-error
:datum obj
:expected-type 'sse-array))))
;;; Type and allocation
(deftype sse-array (&optional (elt-type '* et-p) dims)
"Type of arrays efficiently accessed by SSE aref intrinsics and returned by make-sse-array.
Should be assumed to be SIMPLE-ARRAY, except that displacing with MAKE-SSE-ARRAY is allowed."
(if (eq elt-type '*)
(progn
(when et-p
(error "SSE-ARRAY must have a specific element type."))
`(simple-array * ,dims))
(let* ((upgraded (upgraded-array-element-type elt-type))
(shift (sse-elt-shift-from-saetp (find-saetp upgraded))))
(when (null shift)
(error "Invalid SSE-ARRAY element type: ~S" elt-type))
(unless (subtypep upgraded elt-type)
(warn "SSE-ARRAY element type ~S has been upgraded to ~S" elt-type upgraded))
`(simple-array ,upgraded ,dims))))
(defun make-sse-array (dimensions &key (element-type '(unsigned-byte 8)) (initial-element nil ie-p) displaced-to (displaced-index-offset 0))
"Allocates an SSE-ARRAY aligned to the 16-byte boundary. Flattens displacement chains for performance reasons."
(let* ((upgraded (upgraded-array-element-type element-type))
(shift (sse-elt-shift-from-saetp (find-saetp upgraded))))
(when (null shift)
(error "Invalid SSE-ARRAY element type: ~S" element-type))
(if displaced-to
;; Fake displacement by allocating a simple-array header
(let* ((dimensions (if (listp dimensions) dimensions (list dimensions)))
(rank (length dimensions))
(count (reduce #'* dimensions)))
(unless (subtypep element-type (array-element-type displaced-to))
(error "can't displace an array of type ~S into another of type ~S"
element-type (array-element-type displaced-to)))
(with-array-data ((data displaced-to)
(start displaced-index-offset)
(end))
(unless (= start 0)
(error "SSE-ARRAY does not support displaced index offset."))
(unless (<= count end)
(array-bounding-indices-bad-error data start count))
(if (= rank 1)
(progn
(when (< count end)
(warn "SSE-ARRAY displaced size extended to the full length of the vector."))
data)
(let ((new-array (make-array-header simple-array-widetag rank)))
(set-array-header new-array data count nil 0 dimensions nil t)))))
;; X86-64 vectors are already aligned to 16 bytes
(apply #'make-array dimensions :element-type upgraded
(if ie-p (list :initial-element initial-element))))))
;;; AREF intrinsic definition helpers
(defconstant +vector-data-fixup+
(- (* vector-data-offset n-word-bytes) other-pointer-lowtag)
"Offset from a tagged vector pointer to its data")
(defmacro array-data-expr (array-var &optional is-vector)
(ecase is-vector
(:yes array-var)
(:no `(%array-data-vector ,array-var))
((nil)
`(if (array-header-p ,array-var)
(%array-data-vector ,array-var)
,array-var))))
;; Depends on the vector-length field being in the same place
;; as the array fill pointer, which for simple-array is equal
;; to the total size.
(defknown %sse-array-size (simple-array fixnum) array-total-size (flushable always-translatable))
(define-vop (%sse-array-size/0)
(:translate %sse-array-size)
(:args (array :scs (descriptor-reg)))
(:arg-types * (:constant (integer 0 0)))
(:info gap)
(:ignore gap)
(:policy :fast-safe)
(:results (result :scs (any-reg)))
(:result-types tagged-num)
(:generator 3
(loadw result array vector-length-slot other-pointer-lowtag)))
(define-vop (%sse-array-size %sse-array-size/0)
(:arg-types * (:constant (integer 1 16)))
(:ignore)
(:temporary (:sc any-reg) tmp)
(:generator 8
(loadw result array vector-length-slot other-pointer-lowtag)
(inst mov tmp (fixnumize gap))
(inst cmp result tmp)
(inst cmov :ng tmp result)
(inst sub result tmp)))
(defmacro with-sse-data (((sap-var data-var array) (offset-var index)) &body code)
;; Compute a SAP and offset for the specified array and index. Check bounds.
(with-unique-names (data-index data-end elt-shift access-size)
(once-only ((array array)
(index index))
`(locally
(declare (optimize (insert-array-bounds-checks 0)))
(with-array-data ((,data-var ,array)
(,data-index ,index)
(,data-end))
(let* ((,sap-var (int-sap (get-lisp-obj-address ,data-var)))
(,elt-shift (sse-elt-shift-of ,data-var))
(,access-size (ash 16 (- ,elt-shift)))
(,offset-var (+ (ash ,data-index ,elt-shift) +vector-data-fixup+)))
(declare (type system-area-pointer ,sap-var)
(type fixnum ,offset-var))
(unless (<= 0 ,data-index (+ ,data-index ,access-size) ,data-end)
(array-bounding-indices-bad-error ,array ,index (+ ,index ,access-size)))
,@code))))))
(defun sse-array-info-or-give-up (lvar)
;; Look up the SSE element size and check if it is definitely a vector
(let ((type (lvar-type lvar)))
(unless (and (array-type-p type)
(not (array-type-complexp type)))
(give-up-ir1-transform "not a simple array"))
(let* ((etype (array-type-specialized-element-type type))
(shift (sse-elt-shift-from-saetp
(if (eq etype *wild-type*) nil
(find-saetp-by-ctype etype)))))
(unless shift
(give-up-ir1-transform "not a known SSE-compatible array element type: ~S"
(type-specifier etype)))
(values (ash 1 shift) ; step
(1- (ash 16 (- shift))) ; gap
(and (listp (array-type-dimensions type))
(if (null (cdr (array-type-dimensions type))) :yes :no))))))
(defmacro def-aref-intrinsic (postfix rtype reader writer &key (check-bounds t))
(let* ((rm-aref (symbolicate "ROW-MAJOR-AREF-" postfix))
(rm-aset (if writer (symbolicate "ROW-MAJOR-ASET-" postfix)))
(aref (symbolicate "AREF-" postfix))
(aset (if writer (symbolicate "%ASET-" postfix)))
(reader-vop (symbolicate "%" reader))
(reader/ix-vop (symbolicate "%" reader "/IX"))
(writer-vop (if writer (symbolicate "%" writer)))
(writer/ix-vop (if writer (symbolicate "%" writer "/IX")))
(rtype (or rtype '(values)))
(index-expression
(ecase check-bounds
(t ``(the index (%check-bound array (%sse-array-size array ,gap) index)))
((nil) ``(the index index))
(:no-gap ``(the index (%check-bound array (%sse-array-size array 0) index))))))
`(progn
;; ROW-MAJOR-AREF
(export ',rm-aref)
(defknown ,rm-aref (array index) ,rtype (foldable flushable))
(defun ,rm-aref (array index)
(with-sse-data ((sap data array)
(offset index))
(,reader-vop sap offset)))
;;
(deftransform ,rm-aref ((array index) (simple-array t) * :important t)
,(format nil "open-code ~A" rm-aref)
(multiple-value-bind (step gap is-vector) (sse-array-info-or-give-up array)
(declare (ignorable gap))
`(,',reader/ix-vop (array-data-expr array ,is-vector)
,,index-expression
,step ,+vector-data-fixup+)))
;; AREF
(export ',aref)
(defknown ,aref (array &rest index) ,rtype (foldable flushable))
(defun ,aref (array &rest indices)
(declare (truly-dynamic-extent indices))
(with-sse-data ((sap data array)
(offset (%array-row-major-index array indices)))
(,reader-vop sap offset)))
;;
(defoptimizer (,aref derive-type) ((array &rest indices) node)
(assert-array-rank array (length indices))
(values-specifier-type ',rtype))
(deftransform ,aref ((array &rest indices) (simple-array &rest t) * :important t)
,(format nil "open-code ~A" aref)
(multiple-value-bind (step gap is-vector) (sse-array-info-or-give-up array)
(declare (ignorable gap))
(let ((syms (make-gensym-list (length indices))))
`(lambda (array ,@syms)
(let ((index ,(if (eq is-vector :yes) (first syms)
`(array-row-major-index array ,@syms))))
(,',reader/ix-vop (array-data-expr array ,is-vector)
,,index-expression
,step ,+vector-data-fixup+))))))
,@(if writer
`(;; ROW-MAJOR-ASET
(defknown ,rm-aset (array index sse-pack) ,rtype (unsafe))
(defsetf ,rm-aref ,rm-aset)
(defun ,rm-aset (array index new-value)
(with-sse-data ((sap data array)
(offset index))
(,writer-vop sap offset (the ,rtype new-value))
new-value))
;;
(deftransform ,rm-aset ((array index value) (simple-array t t) * :important t)
,(format nil "open-code ~A" rm-aset)
(multiple-value-bind (step gap is-vector) (sse-array-info-or-give-up array)
(declare (ignorable gap))
`(progn
(,',writer/ix-vop (array-data-expr array ,is-vector)
,,index-expression
,step ,+vector-data-fixup+
(the sse-pack value))
value)))
;; %ASET
(defknown ,aset (array &rest t) ,rtype (unsafe))
(defsetf ,aref ,aset)
(defun ,aset (array &rest stuff)
(let ((new-value (car (last stuff))))
(with-sse-data ((sap data array)
(offset (%array-row-major-index array (nbutlast stuff))))
(,writer-vop sap offset (the ,rtype new-value))
new-value)))
;;
(defoptimizer (,aset derive-type) ((array &rest stuff) node)
(assert-array-rank array (1- (length stuff)))
(assert-lvar-type (car (last stuff)) (specifier-type 'sse-pack)
(lexenv-policy (node-lexenv node)))
(specifier-type ',rtype))
(deftransform ,aset ((array &rest stuff) (simple-array &rest t) * :important t)
,(format nil "open-code ~A" aset)
(multiple-value-bind (step gap is-vector) (sse-array-info-or-give-up array)
(declare (ignorable gap))
(let ((syms (make-gensym-list (length stuff))))
`(lambda (array ,@syms)
(let ((index (the index ,(if (eq is-vector :yes) (first syms)
`(array-row-major-index array ,@(butlast syms))))))
(,',writer/ix-vop (array-data-expr array ,is-vector)
,,index-expression
,step ,+vector-data-fixup+
(the sse-pack ,(car (last syms)))))
,(car (last syms)))))))))))

View file

@ -0,0 +1,557 @@
;;; -*- mode: Lisp; indent-tabs-mode: nil; -*-
;;;
;;; Copyright (c) 2010, Alexander Gavrilov (angavrilov@gmail.com)
;;;
;;; This file contains definitions of abstract VOPs, macros
;;; and utility functions used to implement the intrinsics.
;;;
(in-package #:SSE)
;;; The specific pack types
(deftype int-sse-pack () '(sse-pack integer))
(deftype float-sse-pack () '(sse-pack single-float))
(deftype double-sse-pack () '(sse-pack double-float))
;;; Helper functions
(defconstant +uint32-mask+ #xFFFFFFFF)
(defconstant +uint64-mask+ #xFFFFFFFFFFFFFFFF)
(defconstant +min-int32+ (- (ash 1 31)))
(defconstant +max-int32+ (1- (ash 1 31)))
(defun type-name-to-primitive (lt)
(primitive-type-name (primitive-type (specifier-type lt))))
(defun move-cmd-for-type (lt)
(ecase lt
(int-sse-pack 'movdqa)
((float-sse-pack double-sse-pack) 'movaps)))
(defun ensure-reg-or-mem (tn)
(sc-case tn
((sse-pack-immediate immediate)
(register-inline-constant (tn-value tn)))
(t tn)))
(defmacro ensure-load (type tgt src)
`(unless (location= ,tgt ,src)
(inst ,(move-cmd-for-type type) ,tgt (ensure-reg-or-mem ,src))))
(defmacro ensure-move (type tgt src)
`(unless (location= ,tgt ,src)
(inst ,(move-cmd-for-type type) ,tgt ,src)))
(defmacro save-intrinsic-spec (name info)
`(eval-when (:compile-toplevel :load-toplevel :execute)
(setf (get ',name 'intrinsic-spec) ',info)))
(defmacro def-splice-transform (name args &body code)
(let* ((direct-args (mapcar (lambda (x) (if (consp x) (gensym) x)) args))
(flat-args (mapcan (lambda (x) (if (consp x) (copy-list (rest x)) (list x))) args)))
`(deftransform ,name ((,@direct-args) * *)
,(format nil "Simplify combination ~A" (cons name args))
,@(loop for spec in args and name in direct-args
when (consp spec)
collect `(splice-fun-args ,name ',(first spec) ,(1- (length spec))))
(list* 'lambda ',flat-args ',code))))
;;; Index-offset splicing
(defun fold-index-addressing (fun-name index scale &key setter-p)
(multiple-value-bind (func index-args) (extract-fun-args index '(+ - * ash) 2)
(destructuring-bind (x constant) index-args
(declare (ignorable x))
(unless (constant-lvar-p constant)
(give-up-ir1-transform))
(let ((value (lvar-value constant))
(scale-value (lvar-value scale)))
(case func
(* (unless (typep (* value scale-value) '(signed-byte 32))
(give-up-ir1-transform "constant is too large for inlining")))
(ash (unless (and (>= value 0)
(typep (ash scale-value value) '(signed-byte 32)))
(give-up-ir1-transform "index shift is unsuitable for inlining"))))
(splice-fun-args index func 2)
(let* ((value-arg (when setter-p '(value)))
(is-scale (member func '(* ash)))
(new-scale (if is-scale `(,func scale const) 'scale))
(new-offset (if is-scale 'offset `(,func offset (* const scale)))))
`(lambda (thing index const scale offset ,@value-arg)
(,fun-name thing index ,new-scale ,new-offset ,@value-arg)))))))
;;; Index-offset addressing
(defun is-tagged-load-scale (value)
(not (logtest value (1- (ash 1 n-fixnum-tag-bits)))))
(deftype tagged-load-scale ()
'(and fixnum (satisfies is-tagged-load-scale)))
(defun find-lea-scale (scale)
(cond ((not (logtest scale 7)) (values (/ scale 8) 8))
((not (logtest scale 3)) (values (/ scale 4) 4))
((not (logtest scale 1)) (values (/ scale 2) 2))
(t (values scale 1))))
(defun reduce-offset (ioffset scale offset)
"Redistribute value from ioffset to offset, while keeping offset int32."
(let* ((istep (if (< ioffset 0) -1 1))
(icount (max 0
(if (< ioffset 0)
(- (1+ +min-int32+) ioffset) ; = (- +max-int32+)
(- ioffset +max-int32+))))
(ostep (* istep scale))
(ocount (truncate (- (if (> ostep 0) +max-int32+ +min-int32+) offset)
ostep))
(count (min ocount icount)))
(values (- ioffset (* count istep))
(+ offset (* count ostep)))))
(defun split-offset (offset scale)
(if (typep offset '(signed-byte 32))
(values 0 offset)
(multiple-value-bind (div rem) (floor offset scale)
(assert (typep rem '(signed-byte 32)))
(if (typep div '(signed-byte 32))
(values div rem)
(reduce-offset div scale rem)))))
(defun power-of-2? (scale)
(and (> scale 0) (not (logtest scale (1- scale)))))
(defun find-power-of-2 (scale)
(assert (power-of-2? scale))
(loop for i from 0 and sv = scale then (ash sv -1)
when (<= sv 1) return i))
(defun make-scaled-ea (size sap index scale offset tmp &key fixnum-index)
"Returns an ea representing the given index*scale + offset formula.
May emit additional instructions using the temporary register."
(assemble ()
(if (or (sc-is index immediate) (= scale 0))
;; Fully constant offset
(let ((value (if (= scale 0) offset
(+ (* (tn-value index) scale) offset))))
(assert (typep value '(signed-byte 64)))
(if (typep value '(signed-byte 32))
(make-ea size :base sap :disp value)
(progn
(inst mov tmp (register-inline-constant value))
(make-ea size :base sap :index tmp))))
;; Indexing
(progn
(when (sc-is index any-reg)
(assert (and fixnum-index (is-tagged-load-scale scale)))
(setf scale (ash scale (- n-fixnum-tag-bits))))
(multiple-value-bind (rscale lscale) (find-lea-scale scale)
;; One-instruction case?
(if (and (= rscale 1) (typep offset '(signed-byte 32)))
(make-ea size :base sap :index index :scale scale :disp offset)
;; Use temporary
(multiple-value-bind (roffset loffset) (split-offset offset lscale)
(labels ((negate-when-<0 (register scale)
(when (< scale 0)
(inst neg register)))
(emit-shift-mul (register scale)
(inst shl register (find-power-of-2 (abs scale)))
(negate-when-<0 register scale))
;; Tries to compute tmp via LEA
(try-use-lea (scale &optional base)
(multiple-value-bind (rrscale rlscale) (find-lea-scale scale)
(when (and (= (abs rrscale) 1) (typep (* rrscale roffset) '(signed-byte 32)))
(when (and (= roffset 0) (null base)) ; minimize loffset
(multiple-value-setq (roffset loffset) (floor offset lscale)))
(let ((xoffset (* rrscale roffset)))
(inst lea tmp
(if (and (= rlscale 1) (null base))
(make-ea :byte :base index :disp xoffset)
(make-ea :byte :base base :index index :scale rlscale :disp xoffset))))
(negate-when-<0 tmp rrscale)
:success))))
(declare (inline negate-when-<0 emit-shift-mul))
(cond
;; same register shift?
((and (= roffset 0) (location= tmp index) (power-of-2? (abs rscale)))
(emit-shift-mul tmp rscale))
;; one LEA?
((try-use-lea rscale))
((try-use-lea (1- rscale) index))
;; Generic case, use mul/shl and add
(t
(if (power-of-2? (abs rscale))
(progn
(move tmp index)
(emit-shift-mul tmp rscale))
(inst imul tmp index rscale))
(unless (= roffset 0)
;; Make loffset as small as possible
(multiple-value-setq (roffset loffset) (floor offset lscale))
(if (typep roffset '(signed-byte 32))
(inst add tmp roffset)
(inst add tmp (register-inline-constant roffset))))))
(make-ea size :base sap :index tmp :scale lscale :disp loffset)))))))))
;; Initialization
(defmacro def-float-set-intrinsic (&whole whole pubname fname atype aregtype rtype move)
(declare (ignore pubname))
`(progn
(save-intrinsic-spec ,fname ,whole)
(defknown ,fname (,atype) ,rtype (foldable flushable))
(define-vop (,fname)
(:translate ,fname)
(:args (arg :scs (,aregtype) :target dst))
(:arg-types ,atype)
(:results (dst :scs (sse-reg)))
(:result-types ,(type-name-to-primitive rtype))
(:policy :fast-safe)
(:generator 1
(unless (location= dst arg)
(inst ,move dst arg))))))
;; Unary operations
(define-vop (sse-unary-base-op)
;; no immediate because expecting to be folded
(:args (x :scs (sse-reg)))
(:arg-types sse-pack)
(:policy :fast-safe)
(:note "inline SSE unary operation")
(:vop-var vop)
(:save-p :compute-only))
(define-vop (sse-unary-op sse-unary-base-op)
(:args (x :scs (sse-reg) :target r))
(:results (r :scs (sse-reg))))
(define-vop (sse-unary-to-int-op sse-unary-base-op)
(:results (r :scs (signed-reg))))
(define-vop (sse-unary-to-uint-op sse-unary-base-op)
(:results (r :scs (unsigned-reg))))
(defmacro def-unary-intrinsic (&whole whole name rtype insn cost c-name &key partial immediate-arg result-size arg-type)
(declare (ignore c-name arg-type))
(let* ((imm (if immediate-arg '(imm)))
(immt (if immediate-arg (list immediate-arg))))
(assert (or (not partial) (not (subtypep rtype 'integer))))
`(progn
(export ',name)
(save-intrinsic-spec ,name ,whole)
(defknown ,name (sse-pack ,@immt) ,rtype (foldable flushable))
(define-vop (,name ,(cond ((subtypep rtype 'unsigned-byte)
'sse-unary-to-uint-op)
((subtypep rtype 'integer)
'sse-unary-to-int-op)
(t 'sse-unary-op)))
(:translate ,name)
(:result-types ,(type-name-to-primitive rtype))
,@(if immediate-arg
`((:arg-types sse-pack (:constant ,immediate-arg))
(:info imm)))
(:generator ,cost
,@(ecase partial
(:one-arg `((ensure-move ,rtype r x)
(inst ,insn r ,@imm)))
(t `((ensure-move ,rtype r x)
(inst ,insn r r ,@imm)))
((nil) `((inst ,insn
,(if result-size `(reg-in-size r ,result-size) 'r)
x ,@imm)))))))))
;; Unary to int32 & sign-extend
(define-vop (sse-cvt-to-int32-op sse-unary-base-op)
(:temporary (:sc signed-reg :offset rax-offset :target r :to :result) rax)
(:results (r :scs (signed-reg))))
(defmacro def-cvt-to-int32-intrinsic (name rtype insn cost c-name &key arg-type)
(declare (ignore arg-type))
`(progn
(export ',name)
(save-intrinsic-spec ,name (def-unary-intrinsic ,name ,rtype ,insn ,cost ,c-name))
(defknown ,name (sse-pack) (signed-byte 32) (foldable flushable))
(define-vop (,name sse-cvt-to-int32-op)
(:translate ,name)
(:result-types ,(type-name-to-primitive rtype))
(:generator ,cost
(inst ,insn (reg-in-size rax :dword) x)
(inst cdqe)
(move r rax)))))
;; NOT intrinsics
(define-vop (sse-not-op sse-unary-op)
(:temporary (:sc sse-reg) tmp))
(defmacro def-not-intrinsic (name rtype insn)
`(progn
(export ',name)
(save-intrinsic-spec ,name (def-unary-intrinsic ,name ,rtype ,insn 3 nil))
(defknown ,name (sse-pack) ,rtype (foldable flushable))
(define-vop (,name sse-not-op)
(:translate ,name)
(:result-types ,(type-name-to-primitive rtype))
(:generator 3
(if (location= x r)
(progn
(inst pcmpeqd tmp tmp)
(inst ,insn r tmp))
(progn
(inst pcmpeqd r r)
(inst ,insn r x)))))))
;; Binary operations
(define-vop (sse-binary-base-op)
(:args (x :scs (sse-reg sse-pack-immediate) :target r)
(y :scs (sse-reg sse-pack-immediate)))
(:results (r :scs (sse-reg)))
(:arg-types sse-pack sse-pack)
(:policy :fast-safe)
(:note "inline SSE binary operation")
(:vop-var vop)
(:save-p :compute-only))
(define-vop (sse-binary-op sse-binary-base-op)
(:temporary (:sc sse-reg) tmp))
(define-vop (sse-binary-comm-op sse-binary-base-op)
(:args (x :scs (sse-reg sse-pack-immediate) :target r)
(y :scs (sse-reg sse-pack-immediate) :target r)))
(defmacro def-binary-intrinsic (&whole whole name rtype insn cost c-name &key commutative tags immediate-arg x-type y-type)
(declare (ignore c-name x-type y-type))
(let* ((imm (if immediate-arg '(imm)))
(immt (if immediate-arg (list immediate-arg))))
`(progn
(export ',name)
(save-intrinsic-spec ,name ,whole)
(defknown ,name (sse-pack sse-pack ,@immt) ,rtype (foldable flushable))
(define-vop (,name ,(if commutative 'sse-binary-comm-op 'sse-binary-op))
(:translate ,name)
(:result-types ,(type-name-to-primitive rtype))
,@(if immediate-arg
`((:arg-types sse-pack sse-pack (:constant ,immediate-arg))
(:info imm)))
(:generator ,cost
,@(if commutative
`((when (location= y r)
(rotatef x y))
(ensure-load ,rtype r x)
(inst ,insn ,@tags r (ensure-reg-or-mem y) ,@imm))
`((unless (location= y r)
(setf tmp r))
(ensure-load ,rtype tmp x)
(inst ,insn ,@tags tmp (ensure-reg-or-mem y) ,@imm)
(ensure-move ,rtype r tmp))))))))
;;; XMM/Integer combination intrinsics
(define-vop (sse-int-base-op)
(:results (r :scs (sse-reg)))
(:policy :fast-safe)
(:note "inline SSE/integer operation")
(:vop-var vop)
(:save-p :compute-only))
(define-vop (sse-int-op sse-int-base-op)
(:args (x :scs (sse-reg sse-pack-immediate) :target r)
(iv :scs (signed-reg signed-stack immediate)))
(:arg-types sse-pack signed-num))
(define-vop (sse-uint-op sse-int-base-op)
(:args (x :scs (sse-reg sse-pack-immediate) :target r)
(iv :scs (unsigned-reg unsigned-stack immediate)))
(:arg-types sse-pack unsigned-num))
(defmacro def-sse-int-intrinsic (&whole whole name itype rtype insn cost c-name &key make-temporary immediate-arg)
(declare (ignore c-name))
(let* ((imm (if immediate-arg '(imm)))
(immt (if immediate-arg (list immediate-arg)))
(unsigned? (subtypep itype 'unsigned-byte)))
`(progn
(export ',name)
(save-intrinsic-spec ,name ,whole)
(defknown ,name (sse-pack ,itype ,@immt) ,rtype (foldable flushable))
(define-vop (,name ,(if unsigned? 'sse-uint-op 'sse-int-op))
(:translate ,name)
(:result-types ,(type-name-to-primitive rtype))
,@(if immediate-arg
`((:arg-types sse-pack
,(if unsigned? 'unsigned-num 'signed-num)
(:constant ,immediate-arg))
(:info imm)))
,@(if make-temporary
`((:temporary (:sc sse-reg) tmp)))
(:generator ,cost
(ensure-load ,rtype r x)
,@(if (eq make-temporary t)
'((inst movd tmp (ensure-reg-or-mem iv)))
make-temporary)
(inst ,insn r ,(if make-temporary 'tmp '(ensure-reg-or-mem iv)) ,@imm))))))
;;; Memory intrinsics
(define-vop (sse-load-base-op)
(:results (r :scs (sse-reg)))
(:policy :fast-safe)
(:note "inline SSE load operation"))
(define-vop (sse-load-op sse-load-base-op)
(:args (sap :scs (sap-reg))
(offset :scs (signed-reg)))
(:arg-types system-area-pointer signed-num))
(define-vop (sse-xmm-load-op sse-load-base-op)
(:args (value :scs (sse-reg sse-pack-immediate) :target r)
(sap :scs (sap-reg))
(offset :scs (signed-reg)))
(:arg-types sse-pack system-area-pointer signed-num))
(define-vop (sse-load-imm-op sse-load-base-op)
(:args (sap :scs (sap-reg)))
(:arg-types system-area-pointer
(:constant (signed-byte 32)))
(:info offset))
(define-vop (sse-xmm-load-imm-op sse-load-base-op)
(:args (value :scs (sse-reg sse-pack-immediate) :target r)
(sap :scs (sap-reg)))
(:arg-types sse-pack system-area-pointer
(:constant (signed-byte 32)))
(:info offset))
(define-vop (sse-load-ix-op sse-load-base-op)
(:args (sap :scs (descriptor-reg) :to :eval)
(index :scs (signed-reg immediate) :target tmp))
(:arg-types * signed-num (:constant fixnum) (:constant fixnum))
(:temporary (:sc signed-reg :from (:argument 1)) tmp)
(:info scale offset))
(define-vop (sse-load-ix-op/tag sse-load-base-op)
(:args (sap :scs (descriptor-reg) :to :eval)
(index :scs (any-reg signed-reg immediate) :target tmp))
(:arg-types * tagged-num (:constant tagged-load-scale) (:constant fixnum))
(:temporary (:sc any-reg :from (:argument 1)) tmp)
(:info scale offset))
(defmacro def-load-intrinsic (&whole whole name rtype insn c-name
&key register-arg tags postfix-fmt (size :qword))
(declare (ignore c-name postfix-fmt))
(let* ((vop (symbolicate "%" name))
(c-vop (symbolicate vop "-C"))
(ix-vop (symbolicate vop "/IX"))
(valtype (if register-arg '(sse-pack)))
(valarg (if register-arg '(value)))
(r-arg (if rtype '(r)))
(rtypes (if rtype
`(:result-types ,(type-name-to-primitive rtype))
`(:results))))
(assert (or rtype (not register-arg)))
`(progn
(export ',name)
(save-intrinsic-spec ,name ,whole)
(defknown ,vop (,@valtype system-area-pointer fixnum) ,(or rtype '(values)) (flushable always-translatable))
(define-vop (,vop ,(if register-arg 'sse-xmm-load-op 'sse-load-op))
(:translate ,vop)
,rtypes
(:generator 5
,(if register-arg `(ensure-load ,rtype r value))
(inst ,insn ,@tags ,@r-arg (make-ea ,size :base sap :index offset))))
(define-vop (,c-vop ,(if register-arg 'sse-xmm-load-imm-op 'sse-load-imm-op))
(:translate ,vop)
,rtypes
(:generator 4
,(if register-arg `(ensure-load ,rtype r value))
(inst ,insn ,@tags ,@r-arg (make-ea ,size :base sap :disp offset))))
(def-splice-transform ,vop (,@valarg (sap+ sap offset1) offset2)
(,vop ,@valarg sap (+ offset1 offset2)))
,@(if (null register-arg)
`(;; Vector indexing version
(defknown ,ix-vop (simple-array fixnum fixnum fixnum) ,(or rtype '(values))
(flushable always-translatable))
(define-vop (,ix-vop sse-load-ix-op)
(:translate ,ix-vop)
,rtypes
(:generator 4
(inst ,insn ,@tags ,@r-arg (make-scaled-ea ,size sap index scale offset tmp))))
(define-vop (,(symbolicate ix-vop "/TAG") sse-load-ix-op/tag)
(:translate ,ix-vop)
,rtypes
(:generator 3
(inst ,insn ,@tags ,@r-arg (make-scaled-ea ,size sap index scale offset tmp :fixnum-index t))))
(deftransform ,ix-vop ((thing index scale offset))
"fold semi-constant index expressions"
(fold-index-addressing ',ix-vop index scale)))))))
(define-vop (sse-store-base-op)
(:policy :fast-safe)
(:note "inline SSE store operation"))
(define-vop (sse-store-op sse-store-base-op)
(:args (sap :scs (sap-reg))
(offset :scs (signed-reg))
(value :scs (sse-reg)))
(:arg-types system-area-pointer signed-num sse-pack))
(define-vop (sse-store-imm-op sse-store-base-op)
(:args (sap :scs (sap-reg))
(value :scs (sse-reg)))
(:arg-types system-area-pointer
(:constant (signed-byte 32))
sse-pack)
(:info offset))
(define-vop (sse-store-ix-op sse-store-base-op)
(:args (sap :scs (descriptor-reg) :to :eval)
(index :scs (signed-reg immediate) :target tmp)
(value :scs (sse-reg)))
(:arg-types * signed-num (:constant fixnum) (:constant fixnum) sse-pack)
(:temporary (:sc signed-reg :from (:argument 1)) tmp)
(:info scale offset))
(define-vop (sse-store-ix-op/tag sse-store-base-op)
(:args (sap :scs (descriptor-reg) :to :eval)
(index :scs (any-reg signed-reg immediate) :target tmp)
(value :scs (sse-reg)))
(:arg-types * tagged-num (:constant tagged-load-scale) (:constant fixnum) sse-pack)
(:temporary (:sc any-reg :from (:argument 1)) tmp)
(:info scale offset))
(defmacro def-store-intrinsic (&whole whole name rtype insn c-name &key setf-name)
(declare (ignore rtype c-name))
(let* ((vop (symbolicate "%" name))
(c-vop (symbolicate vop "-C"))
(ix-vop (symbolicate vop "/IX")))
`(progn
,(unless setf-name `(export ',name))
(save-intrinsic-spec ,name ,whole)
(defknown ,vop (system-area-pointer fixnum sse-pack) (values) (unsafe always-translatable))
(define-vop (,vop sse-store-op)
(:translate ,vop)
(:generator 5
(inst ,insn (make-ea :qword :base sap :index offset) value)))
(define-vop (,c-vop sse-store-imm-op)
(:translate ,vop)
(:generator 4
(inst ,insn (make-ea :qword :base sap :disp offset) value)))
(def-splice-transform ,vop ((sap+ sap offset1) offset2 new-value)
(,vop sap (+ offset1 offset2) new-value))
;; Vector indexing version
(defknown ,ix-vop (simple-array fixnum fixnum fixnum sse-pack) (values)
(unsafe always-translatable))
(define-vop (,ix-vop sse-store-ix-op)
(:translate ,ix-vop)
(:generator 4
(inst ,insn (make-scaled-ea :qword sap index scale offset tmp) value)))
(define-vop (,(symbolicate ix-vop "/TAG") sse-store-ix-op/tag)
(:translate ,ix-vop)
(:generator 3
(inst ,insn (make-scaled-ea :qword sap index scale offset tmp :fixnum-index t) value)))
(deftransform ,ix-vop ((thing index scale offset value))
"fold semi-constant index expressions"
(fold-index-addressing ',ix-vop index scale :setter-p t)))))

View file

@ -0,0 +1,454 @@
;;; -*- mode: Lisp; indent-tabs-mode: nil; -*-
;;;
;;; Copyright (c) 2010, Alexander Gavrilov (angavrilov@gmail.com)
;;;
;;; This file implements VOP-wrapping functions and non-primitive
;;; extensions to the core intrinsic set.
;;;
(in-package #:SSE)
;;; Materialize the intrinsic functions.
;; Since VOPs are activated only on load, actual functions that
;; wrap them have to be defined in a different file. This is a
;; hack to generate the functions from the same macro invocations
;; as the VOPS.
(macrolet ((def-float-set-intrinsic (pubname fname atype aregtype rtype move)
(declare (ignore aregtype move))
`(progn
(defun ,fname (arg)
(declare (type ,atype arg))
(truly-the ,rtype (%primitive ,fname arg)))
;; Public function - includes coercion
(export ',pubname)
(declaim (ftype (function (real) ,rtype) ,pubname)
(inline ,pubname))
(defun ,pubname (arg) (,fname (coerce arg ',atype)))))
(def-unary-intrinsic (name rtype insn cost c-name &key immediate-arg &allow-other-keys)
(declare (ignore insn cost c-name))
(unless immediate-arg
`(defun ,name (x)
(declare (type sse-pack x))
(truly-the ,rtype (%primitive ,name x)))))
(def-binary-intrinsic (name rtype insn cost c-name &key immediate-arg &allow-other-keys)
(declare (ignore insn cost c-name))
(unless immediate-arg
`(defun ,name (x y ,@(if immediate-arg '(imm)))
(declare (type sse-pack x y))
(truly-the ,rtype (%primitive ,name x y)))))
(def-sse-int-intrinsic (name itype rtype insn cost c-name &key immediate-arg &allow-other-keys)
(declare (ignore insn cost c-name))
(unless immediate-arg
`(defun ,name (x iv)
(declare (type sse-pack x)
(type ,itype iv))
(truly-the ,rtype (%primitive ,name x iv)))))
(def-load-intrinsic (name rtype insn c-name &key register-arg &allow-other-keys)
(declare (ignore insn c-name))
(let* ((vop (symbolicate "%" name))
(valarg (if register-arg '(value))))
`(progn
(declaim (inline ,name))
(defun ,name (,@valarg pointer &optional (offset 0))
(declare ,@(if register-arg '((type sse-pack value)))
(type system-area-pointer pointer)
(type fixnum offset))
,(if rtype
`(truly-the ,rtype (,vop ,@valarg pointer offset))
`(,vop ,@valarg pointer offset))))))
(def-store-intrinsic (name rtype insn c-name &key setf-name &allow-other-keys)
(declare (ignore insn c-name))
(let* ((vop (symbolicate "%" name)))
`(progn
(declaim (inline ,name))
(defun ,name (pointer value &optional (offset 0))
(declare (type system-area-pointer pointer)
(type sse-pack value)
(type fixnum offset))
(,vop pointer offset value)
(truly-the ,rtype value))
,(if setf-name
`(defsetf ,setf-name (pointer &optional (offset 0)) (value)
`(,',name ,pointer ,value ,offset)))))))
;; Load the definition list
#.(loop for name being each present-symbol
when (get name 'intrinsic-spec)
collect it into specs
finally (return `(progn ,@specs))))
;;; Helper functions and macros
(defmacro def-utility (name args rtype &body code)
`(progn
(export ',name)
(declaim (ftype (function ,(mapcar (constantly 'sse-pack) args) ,rtype) ,name)
(inline ,name))
(defun ,name ,args ,@code)))
(defmacro def-if-function (name rtype postfix)
(let* ((not-x (symbolicate "NOT-" postfix))
(or-x (symbolicate "OR-" postfix))
(and-x (symbolicate "AND-" postfix))
(andn-x (symbolicate "ANDNOT-" postfix))
(xor-x (symbolicate "XOR-" postfix))
(true (%make-sse-pack #xFFFFFFFFFFFFFFFF #xFFFFFFFFFFFFFFFF))
(false (%make-sse-pack 0 0)))
`(progn
(export ',name)
(defknown ,name (sse-pack sse-pack sse-pack) ,rtype (foldable flushable))
(defun ,name (condition true-val false-val)
(,or-x (,and-x condition true-val)
(,andn-x condition false-val)))
;; Instead of inlining, use a transform so that the splice
;; rule has a chance to apply. This depends on transform
;; definitions behaving like a LIFO:
(deftransform ,name ((condition true-val false-val) * *)
"Expand the conditional."
'(,or-x (,and-x condition true-val) (,andn-x condition false-val)))
(def-splice-transform ,name ((,not-x cond) tv fv) (,name cond fv tv))
;; NOT elimination and partial constant folding for bitwise ops:
(def-splice-transform ,not-x ((,not-x arg1)) arg1)
(def-splice-transform ,and-x (arg1 (,not-x arg2)) (,andn-x arg2 arg1))
(def-splice-transform ,and-x ((,not-x arg1) arg2) (,andn-x arg1 arg2))
(def-splice-transform ,andn-x ((,not-x arg1) arg2) (,and-x arg1 arg2))
(%deftransform ',or-x '(function * *) #'commutative-arg-swap "place constant arg last")
(%deftransform ',and-x '(function * *) #'commutative-arg-swap "place constant arg last")
(%deftransform ',xor-x '(function * *) #'commutative-arg-swap "place constant arg last")
(deftransform ,or-x ((arg1 arg2) (* (constant-arg (member ,true))) *) ,true)
(deftransform ,or-x ((arg1 arg2) (* (constant-arg (member ,false))) *) 'arg1)
(deftransform ,and-x ((arg1 arg2) (* (constant-arg (member ,true))) *) 'arg1)
(deftransform ,and-x ((arg1 arg2) (* (constant-arg (member ,false))) *) ,false)
(deftransform ,xor-x ((arg1 arg2) (* (constant-arg (member ,false))) *) 'arg1)
(deftransform ,andn-x ((arg1 arg2) (* (constant-arg (member ,true))) *) 'arg1)
(deftransform ,andn-x ((arg1 arg2) (* (constant-arg (member ,false))) *) ,false)
(deftransform ,andn-x ((arg1 arg2) ((constant-arg (member ,true)) *) *) ,false)
(deftransform ,andn-x ((arg1 arg2) ((constant-arg (member ,false)) *) *) 'arg2))))
(defmacro def-not-cmp-pairs (not-fun &rest pairs)
`(progn
,@(loop for (a b) on pairs by #'cddr
collect `(def-splice-transform ,not-fun ((,a arg1 arg2)) (,b arg1 arg2))
collect `(def-splice-transform ,not-fun ((,b arg1 arg2)) (,a arg1 arg2)))))
;;; CPU control
(defun cpu-mxcsr ()
(cpu-mxcsr))
(defun %set-cpu-mxcsr (x)
(declare (type (unsigned-byte 32) x))
(%set-cpu-mxcsr x))
(defsetf cpu-mxcsr %set-cpu-mxcsr)
(defun cpu-load-fence () (cpu-load-fence))
(defun cpu-store-fence () (cpu-store-fence))
(defun cpu-memory-fence () (cpu-memory-fence))
(defun cpu-pause () (cpu-pause))
;;; Single-float
;; Constants
(define-symbol-macro 0.0-ps (truly-the float-sse-pack #.(%make-sse-pack 0 0)))
(define-symbol-macro true-ss (truly-the float-sse-pack #.(%make-sse-pack #xFFFFFFFF 0)))
(define-symbol-macro true-ps (truly-the float-sse-pack #.(%make-sse-pack #xFFFFFFFFFFFFFFFF #xFFFFFFFFFFFFFFFF)))
(define-symbol-macro false-ss (truly-the float-sse-pack #.(%make-sse-pack 0 0)))
(define-symbol-macro false-ps (truly-the float-sse-pack #.(%make-sse-pack 0 0)))
;; Initialization
(declaim (inline set1-ps set-ps setr-ps setzero-ps))
(defun set1-ps (val)
(let ((valv (set-ss val)))
(shuffle-ps valv valv 0)))
(defun set-ps (x3 x2 x1 x0)
(movelh-ps (unpacklo-ps (set-ss x0) (set-ss x1))
(unpacklo-ps (set-ss x2) (set-ss x3))))
(defun setr-ps (x0 x1 x2 x3)
(movelh-ps (unpacklo-ps (set-ss x0) (set-ss x1))
(unpacklo-ps (set-ss x2) (set-ss x3))))
(defun setzero-ps () 0.0-ps)
;; Arithmetic negation
(def-utility neg-ss (arg) float-sse-pack
(xor-ps arg #.(%make-sse-pack #x80000000 0)))
(def-utility neg-ps (arg) float-sse-pack
(xor-ps arg #.(%make-sse-pack #x8000000080000000 #x8000000080000000)))
;; Bitwise operations
(def-if-function if-ps float-sse-pack #:ps)
;; Comparisons
(def-utility >-ss (x y) float-sse-pack (<-ss y x))
(def-utility >-ps (x y) float-sse-pack (<-ps y x))
(def-utility >=-ss (x y) float-sse-pack (<=-ss y x))
(def-utility >=-ps (x y) float-sse-pack (<=-ps y x))
(def-utility />-ss (x y) float-sse-pack (/<-ss y x))
(def-utility />-ps (x y) float-sse-pack (/<-ps y x))
(def-utility />=-ss (x y) float-sse-pack (/<=-ss y x))
(def-utility />=-ps (x y) float-sse-pack (/<=-ps y x))
(def-not-cmp-pairs not-ps
=-ps /=-ps <-ps /<-ps <=-ps /<=-ps >-ps />-ps >=-ps />=-ps cmpord-ps cmpunord-ps)
;; Shuffle
(declaim (inline %sse-pack-to-int %int-to-sse-pack %shuffle-subints))
(defun %sse-pack-to-int (pack)
(logior (%sse-pack-low pack) (ash (%sse-pack-high pack) 64)))
(defun %int-to-sse-pack (val &aux (mask #xFFFFFFFFFFFFFFFF))
(%make-sse-pack (logand val mask) (logand (ash val -64) mask)))
(defun %shuffle-subints (xval yval imm bit-cnt &aux (mask (1- (ash 1 bit-cnt))))
(flet ((bits (idx)
(logand 3 (ash imm (* -2 idx))))
(val (src idx)
(logand mask (ash src (* (- bit-cnt) idx)))))
(logior (val xval (bits 0))
(ash (val xval (bits 1)) bit-cnt)
(ash (val yval (bits 2)) (* 2 bit-cnt))
(ash (val yval (bits 3)) (* 3 bit-cnt)))))
(defun shuffle-ps (x y imm)
(declare (type sse-pack x y))
(let* ((xval (%sse-pack-to-int x))
(yval (%sse-pack-to-int y)))
(truly-the float-sse-pack (%int-to-sse-pack (%shuffle-subints xval yval imm 32)))))
;;; Double-float
;; Constants
(define-symbol-macro 0.0-pd (truly-the double-sse-pack #.(%make-sse-pack 0 0)))
(define-symbol-macro true-sd (truly-the double-sse-pack #.(%make-sse-pack #xFFFFFFFFFFFFFFFF 0)))
(define-symbol-macro true-pd (truly-the double-sse-pack #.(%make-sse-pack #xFFFFFFFFFFFFFFFF #xFFFFFFFFFFFFFFFF)))
(define-symbol-macro false-sd (truly-the double-sse-pack #.(%make-sse-pack 0 0)))
(define-symbol-macro false-pd (truly-the double-sse-pack #.(%make-sse-pack 0 0)))
;; Initialization
(declaim (inline set1-pd set-pd setr-pd setzero-pd))
(defun set1-pd (val)
(let ((valv (set-sd val)))
(shuffle-pd valv valv 0)))
(defun set-pd (x1 x0)
(unpacklo-pd (set-sd x0) (set-sd x1)))
(defun setr-pd (x0 x1)
(unpacklo-pd (set-sd x0) (set-sd x1)))
(defun setzero-pd () 0.0-pd)
;; Arithmetic negation
(def-utility neg-sd (arg) double-sse-pack
(xor-pd arg #.(%make-sse-pack #x8000000000000000 0)))
(def-utility neg-pd (arg) double-sse-pack
(xor-pd arg #.(%make-sse-pack #x8000000000000000 #x8000000000000000)))
;; Bitwise operations
(def-if-function if-pd double-sse-pack #:pd)
;; Comparisons
(def-utility >-sd (x y) double-sse-pack (<-sd y x))
(def-utility >-pd (x y) double-sse-pack (<-pd y x))
(def-utility >=-sd (x y) double-sse-pack (<=-sd y x))
(def-utility >=-pd (x y) double-sse-pack (<=-pd y x))
(def-utility />-sd (x y) double-sse-pack (/<-sd y x))
(def-utility />-pd (x y) double-sse-pack (/<-pd y x))
(def-utility />=-sd (x y) double-sse-pack (/<=-sd y x))
(def-utility />=-pd (x y) double-sse-pack (/<=-pd y x))
(def-not-cmp-pairs not-pd
=-pd /=-pd <-pd /<-pd <=-pd /<=-pd >-pd />-pd >=-pd />=-pd cmpord-pd cmpunord-pd)
;; Shuffle
(defun shuffle-pd (x y imm)
(declare (type sse-pack x y))
(truly-the double-sse-pack
(%make-sse-pack (if (logtest imm 1) (%sse-pack-high x) (%sse-pack-low x))
(if (logtest imm 2) (%sse-pack-high y) (%sse-pack-low y)))))
;;; Integer
;; Constants
(define-symbol-macro 0-pi (truly-the int-sse-pack #.(%make-sse-pack 0 0)))
(define-symbol-macro true-pi (truly-the int-sse-pack #.(%make-sse-pack #xFFFFFFFFFFFFFFFF #xFFFFFFFFFFFFFFFF)))
(define-symbol-macro false-pi (truly-the int-sse-pack #.(%make-sse-pack 0 0)))
;; Initialization
(macrolet ((defset (name type)
`(defun ,name (x)
(declare (type ,type x))
(,name x))))
(defset %set-int (signed-byte 64))
(defset %set-uint (unsigned-byte 64))
(defset convert-si32-to-pi (signed-byte 32))
(defset convert-su32-to-pi (unsigned-byte 32))
(defset convert-si64-to-pi (signed-byte 64))
(defset convert-su64-to-pi (unsigned-byte 64)))
(macrolet ((defset1 (name setter type shuffle &rest expands)
`(progn
(export ',name)
(declaim (inline ,name))
(defun ,name (arg)
(let ((val (,setter (the ,type arg))))
(declare (type int-sse-pack val))
,@(loop for x in expands collect `(setq val (,x val val)))
(shuffle-pi32 val ,shuffle))))))
(defset1 set1-pi8 %set-int fixnum #4r0000 unpacklo-pi8 unpacklo-pi16)
(defset1 set1-pi16 %set-int fixnum #4r0000 unpacklo-pi16)
(defset1 set1-pi32 %set-int (signed-byte 32) #4r0000)
(defset1 set1-pu32 %set-uint (unsigned-byte 32) #4r0000)
(defset1 set1-pi64 %set-int (signed-byte 64) #4r1010)
(defset1 set1-pu64 %set-uint (unsigned-byte 64) #4r1010))
(macrolet ((defset (name rname setter type depth)
(let* ((names (loop for i from 0 below (ash 1 depth)
collect (symbolicate (format nil "X~A" i))))
(funcs #(unpacklo-pi64 unpacklo-pi32 unpacklo-pi16 unpacklo-pi8))
(body (loop for i downfrom depth to 0
for bv = (mapcar (lambda (x) `(,setter (the ,type ,x))) names)
then (loop for (a b) on bv by #'cddr
collect `(,(svref funcs i) ,a ,b))
finally (return (first bv)))))
`(progn
(export ',name)
(export ',rname)
(declaim (inline ,name ,rname))
(defun ,name (,@(reverse names)) ,body)
(defun ,rname (,@names) ,body)))))
(defset set-pi8 setr-pi8 %set-int fixnum 4)
(defset set-pi16 setr-pi16 %set-int fixnum 3)
(defset set-pi32 setr-pi32 %set-int (signed-byte 32) 2)
(defset set-pu32 setr-pu32 %set-uint (unsigned-byte 32) 2)
(defset set-pi64 setr-pi64 %set-int (signed-byte 64) 1)
(defset set-pu64 setr-pu64 %set-uint (unsigned-byte 64) 1))
(declaim (inline setzero-pi))
(defun setzero-pi () 0-pi)
;; Masked move
(export 'maskmoveu-pi)
(declaim (inline maskmoveu-pi))
(defun maskmoveu-pi (value mask pointer &optional (offset 0))
(declare (type sse-pack value mask)
(type system-area-pointer pointer)
(type fixnum offset))
(%maskmoveu-pi value mask pointer offset))
;; Arithmetic negation (subtract from 0)
(macrolet ((frob (name subf)
`(def-utility ,name (arg) int-sse-pack (,subf 0-pi arg))))
(frob neg-pi8 sub-pi8)
(frob neg-pi16 sub-pi16)
(frob neg-pi32 sub-pi32)
(frob neg-pi64 sub-pi64))
;; Bitwise operations
(def-if-function if-pi int-sse-pack #:pi)
;; Comparisons
(def-utility <-pi8 (x y) int-sse-pack (>-pi8 y x))
(def-utility <-pi16 (x y) int-sse-pack (>-pi16 y x))
(def-utility <-pi32 (x y) int-sse-pack (>-pi32 y x))
(def-utility <=-pi8 (x y) int-sse-pack (not-pi (>-pi8 x y)))
(def-utility <=-pi16 (x y) int-sse-pack (not-pi (>-pi16 x y)))
(def-utility <=-pi32 (x y) int-sse-pack (not-pi (>-pi32 x y)))
(def-utility >=-pi8 (x y) int-sse-pack (not-pi (>-pi8 y x)))
(def-utility >=-pi16 (x y) int-sse-pack (not-pi (>-pi16 y x)))
(def-utility >=-pi32 (x y) int-sse-pack (not-pi (>-pi32 y x)))
(def-utility /=-pi8 (x y) int-sse-pack (not-pi (=-pi8 x y)))
(def-utility /=-pi16 (x y) int-sse-pack (not-pi (=-pi16 x y)))
(def-utility /=-pi32 (x y) int-sse-pack (not-pi (=-pi32 x y)))
;; Shifts
(defun slli-pi (x imm)
(declare (type sse-pack x))
(truly-the int-sse-pack
(if (> imm 15)
0-pi
(%int-to-sse-pack (ash (%sse-pack-to-int x) (* 8 imm))))))
(defun srli-pi (x imm)
(declare (type sse-pack x))
(truly-the int-sse-pack
(if (> imm 15)
0-pi
(%int-to-sse-pack (ash (%sse-pack-to-int x) (* -8 imm))))))
;; Extract & insert
(defun extract-pi16 (x imm)
(declare (type sse-pack x))
(logand #xFFFF
(ash (%sse-pack-to-int x)
(- (* 16 (logand imm 7))))))
(defun insert-pi16 (x intv imm)
(declare (type sse-pack x))
(let ((shift (* 16 (logand imm 7))))
(truly-the int-sse-pack
(%int-to-sse-pack
(logior (logand (%sse-pack-to-int x)
(lognot (ash #xFFFF shift)))
(ash (logand intv #xFFFF) shift))))))
;; Shuffle
(defun shuffle-pi32 (x imm)
(declare (type sse-pack x))
(let* ((xval (%sse-pack-to-int x)))
(truly-the int-sse-pack (%int-to-sse-pack (%shuffle-subints xval xval imm 32)))))
(defun shufflelo-pi16 (x imm)
(declare (type sse-pack x))
(let* ((xval (%sse-pack-low x)))
(truly-the int-sse-pack (%make-sse-pack (%shuffle-subints xval xval imm 16)
(%sse-pack-high x)))))
(defun shufflehi-pi16 (x imm)
(declare (type sse-pack x))
(let* ((xval (%sse-pack-high x)))
(truly-the int-sse-pack (%make-sse-pack (%sse-pack-low x)
(%shuffle-subints xval xval imm 16)))))

View file

@ -0,0 +1,61 @@
;;; -*- mode: Lisp; indent-tabs-mode: nil; -*-
;;;
;;; Copyright (c) 2010, Alexander Gavrilov (angavrilov@gmail.com)
;;;
;;; This file contains definitions for vectorized access
;;; to specialized lisp arrays.
;;;
(in-package #:SSE)
;;; Prefetch: AREF-PREFETCH-*, ROW-MAJOR-AREF-PREFETCH-*
(def-aref-intrinsic #:PREFETCH-T0 nil cpu-prefetch-t0 nil :check-bounds nil)
(def-aref-intrinsic #:PREFETCH-T1 nil cpu-prefetch-t1 nil :check-bounds nil)
(def-aref-intrinsic #:PREFETCH-T2 nil cpu-prefetch-t2 nil :check-bounds nil)
(def-aref-intrinsic #:PREFETCH-NTA nil cpu-prefetch-nta nil :check-bounds nil)
(def-aref-intrinsic #:CLFLUSH nil cpu-clflush nil :check-bounds :no-gap)
;;; Single-float
;; AREF-PS, ROW-MAJOR-AREF-PS
(def-aref-intrinsic #:PS float-sse-pack mem-ref-ps mem-set-ps)
;; AREF-APS, ROW-MAJOR-AREF-APS (requires alignment)
(def-aref-intrinsic #:APS float-sse-pack mem-ref-aps mem-set-aps)
;; AREF-SPS, ROW-MAJOR-AREF-SPS (requires alignment; no write cache)
(def-aref-intrinsic #:SPS float-sse-pack mem-ref-aps stream-ps)
;;; Double-float
;; AREF-PD, ROW-MAJOR-AREF-PD
(def-aref-intrinsic #:PD double-sse-pack mem-ref-pd mem-set-pd)
;; AREF-APD, ROW-MAJOR-AREF-APD (requires alignment)
(def-aref-intrinsic #:APD double-sse-pack mem-ref-apd mem-set-apd)
;; AREF-SPD, ROW-MAJOR-AREF-SPD (requires alignment; no write cache)
(def-aref-intrinsic #:SPD double-sse-pack mem-ref-apd stream-pd)
;;; Integer
;; AREF-PI, ROW-MAJOR-AREF-PI
(def-aref-intrinsic #:PI int-sse-pack mem-ref-pi mem-set-pi)
;; AREF-API, ROW-MAJOR-AREF-API (requires alignment)
(def-aref-intrinsic #:API int-sse-pack mem-ref-api mem-set-api)
;; AREF-SPI, ROW-MAJOR-AREF-SPI (requires alignment; no write cache)
(def-aref-intrinsic #:SPI int-sse-pack mem-ref-api stream-pi)

View file

@ -0,0 +1,689 @@
;;; -*- mode: Lisp; indent-tabs-mode: nil; -*-
;;;
;;; Copyright (c) 2010, Alexander Gavrilov (angavrilov@gmail.com)
;;;
;;; This file contains definitions for all SSE intrinsics.
;;;
;;; The macros are defined in the *-core.lisp files.
;;; On SBCL wrapping functions are defined by sbcl-functions.lisp.
;;;
(in-package #:SSE)
#+(and ecl (or ecl_min stage1 cross))
(eval-when (:compile-toplevel)
;; During the initial bootstrap sequence when the contribs are
;; compiled, the system does not load fasls after building them.
;; (For all it knows, it might be cross-compiling to another architecture.)
;; Work around by loading the macro definition file into the interpreter:
(load (merge-pathnames #P"ecl-sse-core.lisp" *compile-file-truename*)))
;;; Prefetch
(def-load-intrinsic cpu-prefetch-t0 nil prefetch "_mm_prefetch" :tags (:t0) :size :byte :postfix-fmt ",_MM_HINT_T0")
(def-load-intrinsic cpu-prefetch-t1 nil prefetch "_mm_prefetch" :tags (:t1) :size :byte :postfix-fmt ",_MM_HINT_T1")
(def-load-intrinsic cpu-prefetch-t2 nil prefetch "_mm_prefetch" :tags (:t2) :size :byte :postfix-fmt ",_MM_HINT_T2")
(def-load-intrinsic cpu-prefetch-nta nil prefetch "_mm_prefetch" :tags (:nta) :size :byte :postfix-fmt ",_MM_HINT_NTA")
(def-load-intrinsic cpu-clflush nil clflush "_mm_clflush" :size :byte)
;;; CPU control
#+sbcl
(progn
(defknown cpu-mxcsr () (unsigned-byte 32) (flushable))
(define-vop (cpu-mxcsr)
(:translate cpu-mxcsr)
(:args) (:arg-types)
(:results (result :scs (unsigned-reg)))
(:result-types unsigned-num)
(:temporary (:sc unsigned-stack) tmp)
(:policy :fast-safe)
(:generator 3
(let ((ea (make-ea :dword :base rbp-tn
:disp (frame-byte-offset (tn-offset tmp)))))
(inst stmxcsr ea)
(inst mov (reg-in-size result :dword) ea))))
(defknown %set-cpu-mxcsr ((unsigned-byte 32)) (unsigned-byte 32) (unsafe))
(define-vop (%set-cpu-mxcsr)
(:translate %set-cpu-mxcsr)
(:args (value :scs (unsigned-reg unsigned-stack) :target result))
(:arg-types unsigned-num)
(:results (result :scs (unsigned-reg)
:load-if (not (and (sc-is result unsigned-stack)
(or (sc-is value unsigned-reg)
(location= value result))))))
(:result-types unsigned-num)
(:temporary (:sc unsigned-stack) tmp)
(:policy :fast-safe)
(:generator 3
(cond ((sc-is value unsigned-stack)
(setf tmp value))
((sc-is result unsigned-stack)
(setf tmp result)))
(move tmp value)
(unless (location= result tmp)
(move result value))
(let ((ea (make-ea :dword :base rbp-tn
:disp (frame-byte-offset (tn-offset tmp)))))
(inst ldmxcsr ea))))
(macrolet ((defvoid (name insn)
`(progn
(export ',name)
(defknown ,name () (values) ())
(define-vop (,name)
(:translate ,name)
(:policy :fast-safe)
(:generator 1
(inst ,insn))))))
(defvoid cpu-load-fence lfence)
(defvoid cpu-store-fence sfence)
(defvoid cpu-memory-fence mfence)
(defvoid cpu-pause pause)))
#+ecl
(progn
(def-intrinsic cpu-mxcsr () fixnum "_mm_getcsr")
(def-intrinsic %set-cpu-mxcsr (fixnum) fixnum "_mm_setcsr" :export nil :ret-arg 0)
(defsetf cpu-mxcsr %set-cpu-mxcsr)
(def-intrinsic cpu-load-fence () nil "_mm_lfence")
(def-intrinsic cpu-store-fence () nil "_mm_sfence")
(def-intrinsic cpu-memory-fence () nil "_mm_mfence")
(def-intrinsic cpu-pause () nil "_mm_pause"))
;;; Single-float
;; Initialization
#+sbcl
(def-float-set-intrinsic set-ss %set-ss single-float single-reg float-sse-pack movaps)
#+ecl
(progn
(def-intrinsic set-ss (single-float) float-sse-pack "_mm_set_ss")
(def-intrinsic set1-ps (single-float) float-sse-pack "_mm_set1_ps")
(def-intrinsic set-ps (single-float single-float single-float single-float) float-sse-pack "_mm_set_ps")
(def-intrinsic setr-ps (single-float single-float single-float single-float) float-sse-pack "_mm_setr_ps")
(def-intrinsic setzero-ps () float-sse-pack "_mm_setzero_ps"))
;; Memory
(def-load-intrinsic mem-ref-ss float-sse-pack movss "_mm_load_ss")
(def-load-intrinsic mem-ref-ps float-sse-pack movups "_mm_loadu_ps")
(def-load-intrinsic mem-ref-aps float-sse-pack movaps "_mm_load_ps")
(def-store-intrinsic mem-set-ss float-sse-pack movss "_mm_store_ss" :setf-name mem-ref-ss)
(def-store-intrinsic mem-set-ps float-sse-pack movups "_mm_storeu_ps" :setf-name mem-ref-ps)
(def-store-intrinsic mem-set-aps float-sse-pack movaps "_mm_store_ps" :setf-name mem-ref-aps)
(def-store-intrinsic stream-ps float-sse-pack movntps "_mm_stream_ps")
;; Arithmetics
(def-binary-intrinsic add-ss float-sse-pack addss 3 "_mm_add_ss")
(def-binary-intrinsic add-ps float-sse-pack addps 3 "_mm_add_ps" :commutative t)
(def-binary-intrinsic sub-ss float-sse-pack subss 3 "_mm_sub_ss")
(def-binary-intrinsic sub-ps float-sse-pack subps 3 "_mm_sub_ps")
(def-binary-intrinsic mul-ss float-sse-pack mulss 5 "_mm_mul_ss")
(def-binary-intrinsic mul-ps float-sse-pack mulps 5 "_mm_mul_ps" :commutative t)
(def-binary-intrinsic div-ss float-sse-pack divss 13 "_mm_div_ss")
(def-binary-intrinsic div-ps float-sse-pack divps 13 "_mm_div_ps")
(def-binary-intrinsic min-ss float-sse-pack minss 3 "_mm_min_ss")
(def-binary-intrinsic min-ps float-sse-pack minps 3 "_mm_min_ps":commutative t)
(def-binary-intrinsic max-ss float-sse-pack maxss 3 "_mm_max_ss")
(def-binary-intrinsic max-ps float-sse-pack maxps 3 "_mm_max_ps" :commutative t)
(def-unary-intrinsic sqrt-ss float-sse-pack sqrtss 20 "_mm_sqrt_ss" :partial t)
(def-unary-intrinsic sqrt-ps float-sse-pack sqrtps 20 "_mm_sqrt_ps")
(def-unary-intrinsic rsqrt-ss float-sse-pack rsqrtss 20 "_mm_rsqrt_ss" :partial t)
(def-unary-intrinsic rsqrt-ps float-sse-pack rsqrtps 20 "_mm_rsqrt_ps")
(def-unary-intrinsic rcp-ss float-sse-pack rcpss 13 "_mm_rcp_ss" :partial t)
(def-unary-intrinsic rcp-ps float-sse-pack rcpps 13 "_mm_rcp_ps")
;; Bitwise logic
#+sbcl
(def-not-intrinsic not-ps float-sse-pack xorps)
(def-binary-intrinsic and-ps float-sse-pack andps 1 "_mm_and_ps" :commutative t)
(def-binary-intrinsic andnot-ps float-sse-pack andnps 1 "_mm_andnot_ps")
(def-binary-intrinsic or-ps float-sse-pack orps 1 "_mm_or_ps" :commutative t)
(def-binary-intrinsic xor-ps float-sse-pack xorps 1 "_mm_xor_ps" :commutative t)
;; Comparisons
(def-binary-intrinsic =-ss float-sse-pack cmpss 3 "_mm_cmpeq_ss" :tags (:eq))
(def-binary-intrinsic =-ps float-sse-pack cmpps 3 "_mm_cmpeq_ps" :tags (:eq) :commutative t)
(def-binary-intrinsic <-ss float-sse-pack cmpss 3 "_mm_cmplt_ss" :tags (:lt))
(def-binary-intrinsic <-ps float-sse-pack cmpps 3 "_mm_cmplt_ps" :tags (:lt))
(def-binary-intrinsic <=-ss float-sse-pack cmpss 3 "_mm_cmple_ss" :tags (:le))
(def-binary-intrinsic <=-ps float-sse-pack cmpps 3 "_mm_cmple_ps" :tags (:le))
#+ecl
(def-binary-intrinsic >-ss float-sse-pack nil nil "_mm_cmpgt_ss")
#+ecl
(def-binary-intrinsic >-ps float-sse-pack nil nil "_mm_cmpgt_ps")
#+ecl
(def-binary-intrinsic >=-ss float-sse-pack nil nil "_mm_cmpge_ss")
#+ecl
(def-binary-intrinsic >=-ps float-sse-pack nil nil "_mm_cmpge_ps")
(def-binary-intrinsic /=-ss float-sse-pack cmpss 3 "_mm_cmpneq_ss" :tags (:neq))
(def-binary-intrinsic /=-ps float-sse-pack cmpps 3 "_mm_cmpneq_ps" :tags (:neq) :commutative t)
(def-binary-intrinsic /<-ss float-sse-pack cmpss 3 "_mm_cmpnlt_ss" :tags (:nlt))
(def-binary-intrinsic /<-ps float-sse-pack cmpps 3 "_mm_cmpnlt_ps" :tags (:nlt))
(def-binary-intrinsic /<=-ss float-sse-pack cmpss 3 "_mm_cmpnle_ss" :tags (:nle))
(def-binary-intrinsic /<=-ps float-sse-pack cmpps 3 "_mm_cmpnle_ps" :tags (:nle))
#+ecl
(def-binary-intrinsic />-ss float-sse-pack nil nil "_mm_cmpngt_ss")
#+ecl
(def-binary-intrinsic />-ps float-sse-pack nil nil "_mm_cmpngt_ps")
#+ecl
(def-binary-intrinsic />=-ss float-sse-pack nil nil "_mm_cmpnge_ss")
#+ecl
(def-binary-intrinsic />=-ps float-sse-pack nil nil "_mm_cmpnge_ps")
(def-binary-intrinsic cmpord-ss float-sse-pack cmpss 3 "_mm_cmpord_ss" :tags (:ord)) ; neither is NaN
(def-binary-intrinsic cmpord-ps float-sse-pack cmpps 3 "_mm_cmpord_ps" :tags (:ord) :commutative t)
(def-binary-intrinsic cmpunord-ss float-sse-pack cmpss 3 "_mm_cmpunord_ss" :tags (:unord))
(def-binary-intrinsic cmpunord-ps float-sse-pack cmpps 3 "_mm_cmpunord_ps" :tags (:unord) :commutative t)
#| Skipped: _mm_u?comi.*_ss |#
;; Misc
(def-binary-intrinsic unpackhi-ps float-sse-pack unpckhps 1 "_mm_unpackhi_ps")
(def-binary-intrinsic unpacklo-ps float-sse-pack unpcklps 1 "_mm_unpacklo_ps")
(def-binary-intrinsic move-ss float-sse-pack movss 1 "_mm_move_ss")
(def-binary-intrinsic movehl-ps float-sse-pack movhlps 1 "_mm_movehl_ps")
(def-binary-intrinsic movelh-ps float-sse-pack movlhps 1 "_mm_movelh_ps")
(def-unary-intrinsic movemask-ps (unsigned-byte 4) movmskps 1 "_mm_movemask_ps" :arg-type float-sse-pack)
;; Shuffle
(def-binary-intrinsic shuffle-ps float-sse-pack shufps 1 "_mm_shuffle_ps" :immediate-arg (unsigned-byte 8))
;; Conversion
(def-unary-intrinsic convert-pi32-to-ps float-sse-pack cvtdq2ps 3 "_mm_cvtepi32_ps" :arg-type int-sse-pack)
(def-unary-intrinsic convert-ps-to-pi32 int-sse-pack cvtps2dq 3 "_mm_cvtps_epi32" :arg-type float-sse-pack)
(def-unary-intrinsic truncate-ps-to-pi32 int-sse-pack cvttps2dq 3 "_mm_cvttps_epi32" :arg-type float-sse-pack)
(def-sse-int-intrinsic convert-si32-to-ss (signed-byte 32) float-sse-pack cvtsi2ss 3 "_mm_cvtsi32_ss")
(def-cvt-to-int32-intrinsic convert-ss-to-si32 (signed-byte 32) cvtss2si 3 "_mm_cvtss_si32" :arg-type float-sse-pack)
(def-cvt-to-int32-intrinsic truncate-ss-to-si32 (signed-byte 32) cvttss2si 3 "_mm_cvttss_si32" :arg-type float-sse-pack)
#+(or x86_64 x86-64)
(def-sse-int-intrinsic convert-si64-to-ss (signed-byte 64) float-sse-pack cvtsi2ss 3
#-msvc "_mm_cvtsi64_ss" #+msvc "_mm_cvtsi64x_ss")
#+(or x86_64 x86-64)
(def-unary-intrinsic convert-ss-to-si64 (signed-byte 64) cvtss2si 3
#-msvc "_mm_cvtss_si64" #+msvc "_mm_cvtss_si64x" :arg-type float-sse-pack)
#+(or x86_64 x86-64)
(def-unary-intrinsic truncate-ss-to-si64 (signed-byte 64) cvttss2si 3
#-msvc "_mm_cvttss_si64" #+msvc "_mm_cvttss_si64x" :arg-type float-sse-pack)
;;; Double-float
;; Initialization
#+sbcl
(def-float-set-intrinsic set-sd %set-sd double-float double-reg double-sse-pack movapd)
#+ecl
(progn
(def-intrinsic set-sd (double-float) double-sse-pack "_mm_set_sd")
(def-intrinsic set1-pd (double-float) double-sse-pack "_mm_set1_pd")
(def-intrinsic set-pd (double-float double-float) double-sse-pack "_mm_set_pd")
(def-intrinsic setr-pd (double-float double-float) double-sse-pack "_mm_setr_pd")
(def-intrinsic setzero-pd () double-sse-pack "_mm_setzero_pd"))
;; Memory
(def-load-intrinsic mem-ref-sd double-sse-pack movsd "_mm_load_sd")
(def-load-intrinsic mem-ref-pd double-sse-pack movupd "_mm_loadu_pd")
(def-load-intrinsic mem-ref-apd double-sse-pack movapd "_mm_load_pd")
(def-load-intrinsic loadh-pd double-sse-pack movhpd "_mm_loadh_pd" :register-arg t)
(def-load-intrinsic loadl-pd double-sse-pack movlpd "_mm_loadl_pd" :register-arg t)
(def-store-intrinsic mem-set-sd double-sse-pack movsd "_mm_store_sd" :setf-name mem-ref-sd)
(def-store-intrinsic mem-set-pd double-sse-pack movupd "_mm_storeu_pd" :setf-name mem-ref-pd)
(def-store-intrinsic mem-set-apd double-sse-pack movapd "_mm_store_pd" :setf-name mem-ref-apd)
(def-store-intrinsic storeh-pd double-sse-pack movhpd "_mm_storeh_pd")
(def-store-intrinsic storel-pd double-sse-pack movlpd "_mm_storel_pd")
(def-store-intrinsic stream-pd double-sse-pack movntpd "_mm_stream_pd")
;; Arithmetics
(def-binary-intrinsic add-sd double-sse-pack addsd 3 "_mm_add_sd")
(def-binary-intrinsic add-pd double-sse-pack addpd 3 "_mm_add_pd" :commutative t)
(def-binary-intrinsic sub-sd double-sse-pack subsd 3 "_mm_sub_sd")
(def-binary-intrinsic sub-pd double-sse-pack subpd 3 "_mm_sub_pd")
(def-binary-intrinsic mul-sd double-sse-pack mulsd 5 "_mm_mul_sd")
(def-binary-intrinsic mul-pd double-sse-pack mulpd 5 "_mm_mul_pd" :commutative t)
(def-binary-intrinsic div-sd double-sse-pack divsd 13 "_mm_div_sd")
(def-binary-intrinsic div-pd double-sse-pack divpd 13 "_mm_div_pd")
(def-binary-intrinsic min-sd double-sse-pack minsd 3 "_mm_min_sd")
(def-binary-intrinsic min-pd double-sse-pack minpd 3 "_mm_min_pd" :commutative t)
(def-binary-intrinsic max-sd double-sse-pack maxsd 3 "_mm_max_sd")
(def-binary-intrinsic max-pd double-sse-pack maxpd 3 "_mm_max_pd" :commutative t)
(def-binary-intrinsic sqrt-sd double-sse-pack sqrtsd 20 "_mm_sqrt_sd")
(def-unary-intrinsic sqrt-pd double-sse-pack sqrtpd 20 "_mm_sqrt_pd")
;; Bitwise logic
#+sbcl
(def-not-intrinsic not-pd double-sse-pack xorpd)
(def-binary-intrinsic and-pd double-sse-pack andpd 1 "_mm_and_pd" :commutative t)
(def-binary-intrinsic andnot-pd double-sse-pack andnpd 1 "_mm_andnot_pd")
(def-binary-intrinsic or-pd double-sse-pack orpd 1 "_mm_or_pd" :commutative t)
(def-binary-intrinsic xor-pd double-sse-pack xorpd 1 "_mm_xor_pd" :commutative t)
;; Comparisons
(def-binary-intrinsic =-sd double-sse-pack cmpsd 3 "_mm_cmpeq_sd" :tags (:eq))
(def-binary-intrinsic =-pd double-sse-pack cmppd 3 "_mm_cmpeq_pd" :tags (:eq) :commutative t)
(def-binary-intrinsic <-sd double-sse-pack cmpsd 3 "_mm_cmplt_sd" :tags (:lt))
(def-binary-intrinsic <-pd double-sse-pack cmppd 3 "_mm_cmplt_pd" :tags (:lt))
(def-binary-intrinsic <=-sd double-sse-pack cmpsd 3 "_mm_cmple_sd" :tags (:le))
(def-binary-intrinsic <=-pd double-sse-pack cmppd 3 "_mm_cmple_pd" :tags (:le))
#+ecl
(def-binary-intrinsic >-sd double-sse-pack nil nil "_mm_cmpgt_sd")
#+ecl
(def-binary-intrinsic >-pd double-sse-pack nil nil "_mm_cmpgt_pd")
#+ecl
(def-binary-intrinsic >=-sd double-sse-pack nil nil "_mm_cmpge_sd")
#+ecl
(def-binary-intrinsic >=-pd double-sse-pack nil nil "_mm_cmpge_pd")
(def-binary-intrinsic /=-sd double-sse-pack cmpsd 3 "_mm_cmpneq_sd" :tags (:neq))
(def-binary-intrinsic /=-pd double-sse-pack cmppd 3 "_mm_cmpneq_pd" :tags (:neq) :commutative t)
(def-binary-intrinsic /<-sd double-sse-pack cmpsd 3 "_mm_cmpnlt_sd" :tags (:nlt))
(def-binary-intrinsic /<-pd double-sse-pack cmppd 3 "_mm_cmpnlt_pd" :tags (:nlt))
(def-binary-intrinsic /<=-sd double-sse-pack cmpsd 3 "_mm_cmpnle_sd" :tags (:nle))
(def-binary-intrinsic /<=-pd double-sse-pack cmppd 3 "_mm_cmpnle_pd" :tags (:nle))
#+ecl
(def-binary-intrinsic />-sd double-sse-pack nil nil "_mm_cmpngt_sd")
#+ecl
(def-binary-intrinsic />-pd double-sse-pack nil nil "_mm_cmpngt_pd")
#+ecl
(def-binary-intrinsic />=-sd double-sse-pack nil nil "_mm_cmpnge_sd")
#+ecl
(def-binary-intrinsic />=-pd double-sse-pack nil nil "_mm_cmpnge_pd")
(def-binary-intrinsic cmpord-sd double-sse-pack cmpsd 3 "_mm_cmpord_sd" :tags (:ord)) ; neither is NaN
(def-binary-intrinsic cmpord-pd double-sse-pack cmppd 3 "_mm_cmpord_pd" :tags (:ord) :commutative t)
(def-binary-intrinsic cmpunord-sd double-sse-pack cmpsd 3 "_mm_cmpunord_sd" :tags (:unord))
(def-binary-intrinsic cmpunord-pd double-sse-pack cmppd 3 "_mm_cmpunord_pd" :tags (:unord) :commutative t)
;; Misc
(def-binary-intrinsic unpackhi-pd double-sse-pack unpckhpd 1 "_mm_unpackhi_pd")
(def-binary-intrinsic unpacklo-pd double-sse-pack unpcklpd 1 "_mm_unpacklo_pd")
(def-binary-intrinsic move-sd double-sse-pack movsd 1 "_mm_move_sd")
(def-unary-intrinsic movemask-pd (unsigned-byte 2) movmskpd 1 "_mm_movemask_pd" :arg-type double-sse-pack)
;; Shuffle
(def-binary-intrinsic shuffle-pd double-sse-pack shufpd 1 "_mm_shuffle_pd" :immediate-arg (unsigned-byte 2))
;; Conversion
(def-unary-intrinsic convert-ps-to-pd double-sse-pack cvtps2pd 3 "_mm_cvtps_pd" :arg-type float-sse-pack)
(def-unary-intrinsic convert-pd-to-ps float-sse-pack cvtpd2ps 3 "_mm_cvtpd_ps" :arg-type double-sse-pack)
(def-binary-intrinsic convert-ss-to-sd double-sse-pack cvtss2sd 3 "_mm_cvtss_sd" :y-type float-sse-pack)
(def-binary-intrinsic convert-sd-to-ss float-sse-pack cvtsd2ss 3 "_mm_cvtsd_ss" :y-type double-sse-pack)
(def-unary-intrinsic convert-pi32-to-pd double-sse-pack cvtdq2pd 3 "_mm_cvtepi32_pd" :arg-type int-sse-pack)
(def-unary-intrinsic convert-pd-to-pi32 int-sse-pack cvtpd2dq 3 "_mm_cvtpd_epi32" :arg-type double-sse-pack)
(def-unary-intrinsic truncate-pd-to-pi32 int-sse-pack cvttpd2dq 3 "_mm_cvttpd_epi32" :arg-type double-sse-pack)
(def-sse-int-intrinsic convert-si32-to-sd (signed-byte 32) double-sse-pack cvtsi2ss 3 "_mm_cvtsi32_sd")
(def-cvt-to-int32-intrinsic convert-sd-to-si32 (signed-byte 32) cvtsd2si 3 "_mm_cvtsd_si32" :arg-type double-sse-pack)
(def-cvt-to-int32-intrinsic truncate-sd-to-si32 (signed-byte 32) cvttsd2si 3 "_mm_cvttsd_si32" :arg-type double-sse-pack)
#+(or x86_64 x86-64)
(def-sse-int-intrinsic convert-si64-to-sd (signed-byte 64) double-sse-pack cvtsi2ss 3
#-msvc "_mm_cvtsi64_sd" #+msvc "_mm_cvtsi64x_sd")
#+(or x86_64 x86-64)
(def-unary-intrinsic convert-sd-to-si64 (signed-byte 64) cvtsd2si 3
#-msvc "_mm_cvtsd_si64" #+msvc "_mm_cvtsd_si64x" :arg-type double-sse-pack)
#+(or x86_64 x86-64)
(def-unary-intrinsic truncate-sd-to-si64 (signed-byte 64) cvttsd2si 3
#-msvc "_mm_cvttsd_si64" #+msvc "_mm_cvttsd_si64x" :arg-type double-sse-pack)
;;; Integer
;; Initialization
#+ecl
(progn
(def-intrinsic set1-pi8 (fixnum) int-sse-pack "_mm_set1_epi8")
(def-intrinsic set1-pi16 (fixnum) int-sse-pack "_mm_set1_epi16")
(def-intrinsic set1-pi32 (ext:integer32) int-sse-pack "_mm_set1_epi32")
#+x86_64
(def-intrinsic set1-pi64 (ext:integer64) int-sse-pack "_mm_set1_epi64x")
(def-intrinsic set1-pu32 (ext:byte32) int-sse-pack "_mm_set1_epi32")
#+x86_64
(def-intrinsic set1-pu64 (ext:byte64) int-sse-pack "_mm_set1_epi64x")
;;-----
(def-intrinsic set-pi8 (fixnum fixnum fixnum fixnum
fixnum fixnum fixnum fixnum
fixnum fixnum fixnum fixnum
fixnum fixnum fixnum fixnum) int-sse-pack "_mm_set_epi8")
(def-intrinsic set-pi16 (fixnum fixnum fixnum fixnum
fixnum fixnum fixnum fixnum) int-sse-pack "_mm_set_epi16")
(def-intrinsic set-pi32 (ext:integer32 ext:integer32 ext:integer32 ext:integer32) int-sse-pack "_mm_set_epi32")
#+x86_64
(def-intrinsic set-pi64 (ext:integer64 ext:integer64) int-sse-pack "_mm_set_epi64x")
(def-intrinsic set-pu32 (ext:byte32 ext:byte32 ext:byte32 ext:byte32) int-sse-pack "_mm_set_epi32")
#+x86_64
(def-intrinsic set-pu64 (ext:byte64 ext:byte64) int-sse-pack "_mm_set_epi64x")
;;-----
(def-intrinsic setr-pi8 (fixnum fixnum fixnum fixnum
fixnum fixnum fixnum fixnum
fixnum fixnum fixnum fixnum
fixnum fixnum fixnum fixnum) int-sse-pack "_mm_setr_epi8")
(def-intrinsic setr-pi16 (fixnum fixnum fixnum fixnum
fixnum fixnum fixnum fixnum) int-sse-pack "_mm_setr_epi16")
(def-intrinsic setr-pi32 (ext:integer32 ext:integer32 ext:integer32 ext:integer32) int-sse-pack "_mm_setr_epi32")
#+x86_64
(def-intrinsic setr-pi64 (ext:integer64 ext:integer64) int-sse-pack "_mm_set_epi64x" :reorder-args t)
(def-intrinsic setr-pu32 (ext:byte32 ext:byte32 ext:byte32 ext:byte32) int-sse-pack "_mm_setr_epi32")
#+x86_64
(def-intrinsic setr-pu64 (ext:byte64 ext:byte64) int-sse-pack "_mm_set_epi64x" :reorder-args t)
;;-----
(def-intrinsic setzero-pi () int-sse-pack "_mm_setzero_si128"))
;; Memory
(def-load-intrinsic mem-ref-pi int-sse-pack movdqu "_mm_loadu_si128")
(def-load-intrinsic mem-ref-api int-sse-pack movdqa "_mm_load_si128")
(def-load-intrinsic mem-ref-si64 int-sse-pack movd "_mm_loadl_epi64")
(def-store-intrinsic mem-set-pi int-sse-pack movdqu "_mm_storeu_si128" :setf-name mem-ref-pi)
(def-store-intrinsic mem-set-api int-sse-pack movdqa "_mm_store_si128" :setf-name mem-ref-api)
(def-store-intrinsic mem-set-si64 int-sse-pack movd "_mm_storel_epi64" :setf-name mem-ref-si64)
(def-store-intrinsic stream-pi int-sse-pack movntdq "_mm_stream_si128")
;; Masked move
#+ecl
(def-mem-intrinsic maskmoveu-pi "char" nil "_mm_maskmoveu_si128" :prefix-args (int-sse-pack int-sse-pack))
#+sbcl
(progn
(defknown %maskmoveu-pi (sse-pack sse-pack system-area-pointer fixnum) (values) (unsafe))
(define-vop (%maskmoveu-pi)
(:translate %maskmoveu-pi)
(:args (value :scs (sse-reg))
(mask :scs (sse-reg))
(sap :scs (sap-reg) :target rdi)
(offset :scs (signed-reg)))
(:arg-types sse-pack sse-pack system-area-pointer signed-num)
(:temporary (:sc sap-reg :offset rdi-offset :from :eval) rdi)
(:policy :fast-safe)
(:note "inline MASKMOVEU operation")
(:generator 5
(if (location= sap rdi)
(inst add rdi offset)
(inst lea rdi (make-ea :qword :base sap :index offset)))
(inst maskmovdqu value mask)))
(define-vop (%maskmoveu-pi-c)
(:translate %maskmoveu-pi)
(:args (value :scs (sse-reg))
(mask :scs (sse-reg))
(sap :scs (sap-reg) :target rdi))
(:arg-types sse-pack sse-pack system-area-pointer (:constant (signed-byte 32)))
(:info offset)
(:temporary (:sc sap-reg :offset rdi-offset :from :eval) rdi)
(:policy :fast-safe)
(:note "inline MASKMOVEU operation")
(:generator 4
(if (location= sap rdi)
(unless (= offset 0)
(inst add rdi offset))
(if (= offset 0)
(inst mov rdi sap)
(inst lea rdi (make-ea :qword :base sap :disp offset))))
(inst maskmovdqu value mask)))
(def-splice-transform %maskmoveu-pi (value mask (sap+ sap offset1) offset2)
(%maskmoveu-pi value mask sap (+ offset1 offset2))))
;; Arithmetics
(def-binary-intrinsic add-pi8 int-sse-pack paddb 1 "_mm_add_epi8" :commutative t)
(def-binary-intrinsic add-pi16 int-sse-pack paddw 1 "_mm_add_epi16" :commutative t)
(def-binary-intrinsic add-pi32 int-sse-pack paddd 1 "_mm_add_epi32" :commutative t)
(def-binary-intrinsic add-pi64 int-sse-pack paddq 1 "_mm_add_epi64" :commutative t)
(def-binary-intrinsic adds-pi8 int-sse-pack paddsb 1 "_mm_adds_epi8" :commutative t)
(def-binary-intrinsic adds-pi16 int-sse-pack paddsw 1 "_mm_adds_epi16" :commutative t)
(def-binary-intrinsic adds-pu8 int-sse-pack paddusb 1 "_mm_adds_epu8" :commutative t)
(def-binary-intrinsic adds-pu16 int-sse-pack paddusw 1 "_mm_adds_epu16" :commutative t)
(def-binary-intrinsic avg-pu8 int-sse-pack pavgb 1 "_mm_avg_epu8" :commutative t)
(def-binary-intrinsic avg-pu16 int-sse-pack pavgw 1 "_mm_avg_epu16" :commutative t)
(def-binary-intrinsic madd-pi16 int-sse-pack pmaddwd 1 "_mm_madd_epi16" :commutative t)
(def-binary-intrinsic max-pu8 int-sse-pack pmaxub 1 "_mm_max_epu8" :commutative t)
(def-binary-intrinsic max-pi16 int-sse-pack pmaxsw 1 "_mm_max_epi16" :commutative t)
(def-binary-intrinsic min-pu8 int-sse-pack pminub 1 "_mm_min_epu8" :commutative t)
(def-binary-intrinsic min-pi16 int-sse-pack pminsw 1 "_mm_min_epi16" :commutative t)
(def-binary-intrinsic mulhi-pi16 int-sse-pack pmulhw 3 "_mm_mulhi_epi16" :commutative t)
(def-binary-intrinsic mulhi-pu16 int-sse-pack pmulhuw 3 "_mm_mulhi_epu16" :commutative t)
(def-binary-intrinsic mullo-pi16 int-sse-pack pmullw 3 "_mm_mullo_epi16" :commutative t)
(def-binary-intrinsic mul-pu32 int-sse-pack pmuludq 3 "_mm_mul_epu32" :commutative t)
(def-binary-intrinsic sad-pu8 int-sse-pack psadbw 1 "_mm_sad_epu8" :commutative t)
(def-binary-intrinsic sub-pi8 int-sse-pack psubb 1 "_mm_sub_epi8")
(def-binary-intrinsic sub-pi16 int-sse-pack psubw 1 "_mm_sub_epi16")
(def-binary-intrinsic sub-pi32 int-sse-pack psubd 1 "_mm_sub_epi32")
(def-binary-intrinsic sub-pi64 int-sse-pack psubq 1 "_mm_sub_epi64")
(def-binary-intrinsic subs-pi8 int-sse-pack psubsb 1 "_mm_subs_epi8")
(def-binary-intrinsic subs-pi16 int-sse-pack psubsw 1 "_mm_subs_epi16")
(def-binary-intrinsic subs-pu8 int-sse-pack psubusb 1 "_mm_subs_epu8")
(def-binary-intrinsic subs-pu16 int-sse-pack psubusw 1 "_mm_subs_epu16")
;; Bitwise logic
#+sbcl
(def-not-intrinsic not-pi int-sse-pack pxor)
(def-binary-intrinsic and-pi int-sse-pack pand 1 "_mm_and_si128" :commutative t)
(def-binary-intrinsic andnot-pi int-sse-pack pandn 1 "_mm_andnot_si128")
(def-binary-intrinsic or-pi int-sse-pack por 1 "_mm_or_si128" :commutative t)
(def-binary-intrinsic xor-pi int-sse-pack pxor 1 "_mm_xor_si128" :commutative t)
;; Shifts
(def-unary-intrinsic slli-pi int-sse-pack pslldq 1 "_mm_slli_si128" :partial :one-arg :immediate-arg (unsigned-byte 8))
(def-sse-int-intrinsic slli-pi16 fixnum int-sse-pack psllw 3 "_mm_slli_epi16" :make-temporary t)
(def-sse-int-intrinsic slli-pi32 fixnum int-sse-pack pslld 3 "_mm_slli_epi32" :make-temporary t)
(def-sse-int-intrinsic slli-pi64 fixnum int-sse-pack psllq 3 "_mm_slli_epi64" :make-temporary t)
(def-binary-intrinsic sll-pi16 int-sse-pack psllw 1 "_mm_sll_epi16")
(def-binary-intrinsic sll-pi32 int-sse-pack pslld 1 "_mm_sll_epi32")
(def-binary-intrinsic sll-pi64 int-sse-pack psllq 1 "_mm_sll_epi64")
(def-sse-int-intrinsic srai-pi16 fixnum int-sse-pack psraw 3 "_mm_srai_epi16" :make-temporary t)
(def-sse-int-intrinsic srai-pi32 fixnum int-sse-pack psrad 3 "_mm_srai_epi32" :make-temporary t)
(def-binary-intrinsic sra-pi16 int-sse-pack psraw 1 "_mm_sra_epi16")
(def-binary-intrinsic sra-pi32 int-sse-pack psrad 1 "_mm_sra_epi32")
(def-unary-intrinsic srli-pi int-sse-pack psrldq 1 "_mm_srli_si128" :partial :one-arg :immediate-arg (unsigned-byte 8))
(def-sse-int-intrinsic srli-pi16 fixnum int-sse-pack psrlw 3 "_mm_srli_epi16" :make-temporary t)
(def-sse-int-intrinsic srli-pi32 fixnum int-sse-pack psrld 3 "_mm_srli_epi32" :make-temporary t)
(def-sse-int-intrinsic srli-pi64 fixnum int-sse-pack psrlq 3 "_mm_srli_epi64" :make-temporary t)
(def-binary-intrinsic srl-pi16 int-sse-pack psrlw 1 "_mm_srl_epi16")
(def-binary-intrinsic srl-pi32 int-sse-pack psrld 1 "_mm_srl_epi32")
(def-binary-intrinsic srl-pi64 int-sse-pack psrlq 1 "_mm_srl_epi64")
#+sbcl
(macrolet ((defimm (name insn bits &key arithmetic)
`(define-vop (,(symbolicate "%" name "-IMM") sse-int-base-op)
(:translate ,name)
(:args (x :scs (sse-reg) :target r))
(:arg-types sse-pack (:constant fixnum))
(:result-types sb-kernel:int-sse-pack)
(:info immv)
(:generator 1
,@(let ((core `(progn
(ensure-move int-sse-pack r x)
(unless (= immv 0)
(inst ,insn r immv)))))
(if arithmetic
`((when (or (< immv 0) (>= immv ,bits))
(setf immv ,bits))
,core)
`((if (or (< immv 0) (>= immv ,bits))
(inst pxor r r)
,core))))))))
(defimm slli-pi16 psllw-imm 16)
(defimm slli-pi32 pslld-imm 32)
(defimm slli-pi64 psllq-imm 64)
(defimm srai-pi16 psraw-imm 16 :arithmetic t)
(defimm srai-pi32 psrad-imm 32 :arithmetic t)
(defimm srli-pi16 psrlw-imm 16)
(defimm srli-pi32 psrld-imm 32)
(defimm srli-pi64 psrlq-imm 64))
;; Comparisons
(def-binary-intrinsic =-pi8 int-sse-pack pcmpeqb 1 "_mm_cmpeq_epi8")
(def-binary-intrinsic =-pi16 int-sse-pack pcmpeqw 1 "_mm_cmpeq_epi16")
(def-binary-intrinsic =-pi32 int-sse-pack pcmpeqd 1 "_mm_cmpeq_epi32")
#+ecl
(def-binary-intrinsic <-pi8 int-sse-pack nil nil "_mm_cmplt_epi8")
#+ecl
(def-binary-intrinsic <-pi16 int-sse-pack nil nil "_mm_cmplt_epi16")
#+ecl
(def-binary-intrinsic <-pi32 int-sse-pack nil nil "_mm_cmplt_epi32")
(def-binary-intrinsic >-pi8 int-sse-pack pcmpgtb 1 "_mm_cmpgt_epi8")
(def-binary-intrinsic >-pi16 int-sse-pack pcmpgtw 1 "_mm_cmpgt_epi16")
(def-binary-intrinsic >-pi32 int-sse-pack pcmpgtd 1 "_mm_cmpgt_epi32")
;; Misc
(def-binary-intrinsic packs-pi16 int-sse-pack packsswb 1 "_mm_packs_epi16")
(def-binary-intrinsic packs-pi32 int-sse-pack packssdw 1 "_mm_packs_epi32")
(def-binary-intrinsic packus-pi16 int-sse-pack packuswb 1 "_mm_packus_epi16")
(def-unary-intrinsic extract-pi16 (unsigned-byte 16) pextrw 1 "_mm_extract_epi16"
:immediate-arg (unsigned-byte 8) :arg-type int-sse-pack)
(def-sse-int-intrinsic insert-pi16 fixnum int-sse-pack pinsrw 1 "_mm_insert_epi16"
:immediate-arg (unsigned-byte 8))
(def-unary-intrinsic movemask-pi8 (unsigned-byte 16) pmovmskb 1 "_mm_movemask_epi8" :arg-type int-sse-pack)
(def-binary-intrinsic unpackhi-pi8 int-sse-pack punpckhbw 1 "_mm_unpackhi_epi8")
(def-binary-intrinsic unpackhi-pi16 int-sse-pack punpckhwd 1 "_mm_unpackhi_epi16")
(def-binary-intrinsic unpackhi-pi32 int-sse-pack punpckhdq 1 "_mm_unpackhi_epi32")
(def-binary-intrinsic unpackhi-pi64 int-sse-pack punpckhqdq 1 "_mm_unpackhi_epi64")
(def-binary-intrinsic unpacklo-pi8 int-sse-pack punpcklbw 1 "_mm_unpacklo_epi8")
(def-binary-intrinsic unpacklo-pi16 int-sse-pack punpcklwd 1 "_mm_unpacklo_epi16")
(def-binary-intrinsic unpacklo-pi32 int-sse-pack punpckldq 1 "_mm_unpacklo_epi32")
(def-binary-intrinsic unpacklo-pi64 int-sse-pack punpcklqdq 1 "_mm_unpacklo_epi64")
(def-unary-intrinsic move-pi64 int-sse-pack movq 1 "_mm_move_epi64")
;; Shuffle
(def-unary-intrinsic shuffle-pi32 int-sse-pack pshufd 1 "_mm_shuffle_epi32" :immediate-arg (unsigned-byte 8))
(def-unary-intrinsic shufflelo-pi16 int-sse-pack pshuflw 1 "_mm_shufflelo_epi16" :immediate-arg (unsigned-byte 8))
(def-unary-intrinsic shufflehi-pi16 int-sse-pack pshufhw 1 "_mm_shufflehi_epi16" :immediate-arg (unsigned-byte 8))
;; Conversion
#+sbcl
(progn
(export 'convert-si32-to-pi)
(defknown convert-si32-to-pi ((signed-byte 32)) int-sse-pack (foldable flushable))
(export 'convert-su32-to-pi)
(defknown convert-su32-to-pi ((unsigned-byte 32)) int-sse-pack (foldable flushable))
(export 'convert-si64-to-pi)
(defknown convert-si64-to-pi ((signed-byte 64)) int-sse-pack (foldable flushable))
(export 'convert-su64-to-pi)
(defknown convert-su64-to-pi ((unsigned-byte 64)) int-sse-pack (foldable flushable))
(defknown %set-int ((signed-byte 64)) int-sse-pack (foldable flushable always-translatable))
(defknown %set-uint ((unsigned-byte 64)) int-sse-pack (foldable flushable always-translatable))
(define-vop (%set-int)
(:translate %set-int %set-uint
convert-si32-to-pi convert-su32-to-pi
convert-si64-to-pi convert-su64-to-pi)
(:args (arg :scs (signed-reg unsigned-reg signed-stack unsigned-stack)))
(:arg-types untagged-num)
(:results (dst :scs (sse-reg)))
(:result-types sb-kernel:int-sse-pack)
(:policy :fast-safe)
(:generator 1
(inst movd dst arg))))
#+ecl
(progn
(def-intrinsic convert-si32-to-pi (ext:integer32) int-sse-pack "_mm_cvtsi32_si128")
(def-intrinsic convert-su32-to-pi (ext:byte32) int-sse-pack "_mm_cvtsi32_si128")
#+x86_64
(def-intrinsic convert-si64-to-pi (ext:integer64) int-sse-pack #-msvc "_mm_cvtsi64_si128" #+msvc "_mm_cvtsi64x_si128")
#+x86_64
(def-intrinsic convert-su64-to-pi (ext:byte64) int-sse-pack #-msvc "_mm_cvtsi64_si128" #+msvc "_mm_cvtsi64x_si128"))
(def-cvt-to-int32-intrinsic convert-pi-to-si32 (signed-byte 32) movd 1 "_mm_cvtsi128_si32"
:arg-type int-sse-pack)
(def-unary-intrinsic convert-pi-to-su32 (unsigned-byte 32) movd 1 "_mm_cvtsi128_si32"
:result-size :dword :arg-type int-sse-pack)
#+(or x86_64 x86-64)
(def-unary-intrinsic convert-pi-to-si64 (signed-byte 64) movd 1
#-msvc "_mm_cvtsi128_si64" #+msvc "_mm_cvtsi128_si64x" :arg-type int-sse-pack)
#+(or x86_64 x86-64)
(def-unary-intrinsic convert-pi-to-su64 (unsigned-byte 64) movd 1
#-msvc "_mm_cvtsi128_si64" #+msvc "_mm_cvtsi128_si64x" :arg-type int-sse-pack)

View file

@ -0,0 +1,61 @@
;;; -*- mode: Lisp; indent-tabs-mode: nil; -*-
;;;
;;; Copyright (c) 2010, Alexander Gavrilov (angavrilov@gmail.com)
;;;
;;; This file defines a package for all SSE intrinsics.
;;;
#+ecl
(eval-when (:load-toplevel)
(require 'cmp))
#+sbcl
(pushnew :SSE2 *features*)
(defpackage #:SSE
#+sbcl
(:use #:COMMON-LISP #:SB-C #:SB-VM #:SB-INT #:SB-KERNEL #:SB-ASSEM #:SB-EXT #:SB-SYS)
#+sbcl
(:import-from #:SB-VM
#:SINGLE-REG #:DOUBLE-REG #:SSE-REG #:SSE-PACK-IMMEDIATE
#:SIGNED-REG #:SIGNED-STACK #:UNSIGNED-REG #:UNSIGNED-STACK
#:SIGNED-NUM #:UNSIGNED-NUM #:UNTAGGED-NUM #:IMMEDIATE
#:SAP-REG #:DESCRIPTOR-REG #:ANY-REG #:TAGGED-NUM
#:RAX-OFFSET #:RDI-OFFSET #:RBP-TN #:FRAME-BYTE-OFFSET
#:MAKE-EA #:REG-IN-SIZE #:LOADW)
#+sbcl
(:import-from #:SB-C
#:SPLICE-FUN-ARGS #:EXTRACT-FUN-ARGS
#:%DEFTRANSFORM #:COMMUTATIVE-ARG-SWAP
#:GIVE-UP-IR1-TRANSFORM #:ABORT-IR1-TRANSFORM
#:INSERT-ARRAY-BOUNDS-CHECKS #:VECTOR-LENGTH
#:ASSERT-ARRAY-RANK #:ASSERT-LVAR-TYPE
#:CONSTANT-LVAR-P #:LVAR-VALUE #:LVAR-TYPE
#:LEXENV-POLICY #:NODE-LEXENV
#:FIND-SAETP #:FIND-SAETP-BY-CTYPE)
#+sbcl
(:import-from #:SB-IMPL
#:%ARRAY-ROW-MAJOR-INDEX)
#+sbcl
(:shadow #:INT-SSE-PACK #:FLOAT-SSE-PACK #:DOUBLE-SSE-PACK)
#+ecl
(:use #:COMMON-LISP #:FFI)
#+ecl
(:import-from #:EXT
#:INT-SSE-PACK #:FLOAT-SSE-PACK #:DOUBLE-SSE-PACK
#:SSE-PACK-P #:ARRAY-ELEMENT-TYPE-BYTE-SIZE
#:*SSE-PACK-PRINT-MODE*)
#+ecl
(:shadow #:SSE-PACK)
;; Common exports:
(:export #:SSE-PACK #:SSE-PACK-P
#:INT-SSE-PACK #:FLOAT-SSE-PACK #:DOUBLE-SSE-PACK
#:*SSE-PACK-PRINT-MODE*
#:SSE-ARRAY #:MAKE-SSE-ARRAY
#:0.0-PS #:TRUE-SS #:FALSE-SS #:TRUE-PS #:FALSE-PS
#:SET1-PS #:SET-PS #:SETR-PS #:SETZERO-PS
#:0.0-PD #:TRUE-SD #:FALSE-SD #:TRUE-PD #:FALSE-PD
#:SET1-PD #:SET-PD #:SETR-PD #:SETZERO-PD
#:0-PI #:TRUE-PI #:FALSE-PI #:SETZERO-PI
#:CPU-MXCSR #:CPU-MXCSR-BITS #:WITH-SAVED-MXCSR #:CPU-CONFIGURE-ROUNDING))

View file

@ -0,0 +1,128 @@
;;; -*- mode: Lisp; indent-tabs-mode: nil; -*-
;;;
;;; Copyright (c) 2010, Alexander Gavrilov (angavrilov@gmail.com)
;;;
;;; This file implements some common utility functions.
;;;
(in-package #:SSE)
;;; CPU control
(eval-when (:compile-toplevel :load-toplevel :execute)
(declaim (ftype (function (&rest t) (unsigned-byte 32)) cpu-mxcsr-bits))
(defun cpu-mxcsr-bits (&rest tags)
(loop with mask = 0
for tag in tags
for bit = (if (listp tag)
(apply #'cpu-mxcsr-bits tag)
(ecase tag
(:except-invalid #x1)
(:except-denormal #x2)
(:except-divide-zero #x4)
(:except-overflow #x8)
(:except-underflow #x10)
(:except-precision #x20)
(:except-all #x3F)
(:denormals-are-zero #x40)
(:mask-invalid #x80)
(:mask-denormal #x100)
(:mask-divide-zero #x200)
(:mask-overflow #x400)
(:mask-underflow #x800)
(:mask-precision #x1000)
(:mask-all #x1f80)
(:round-nearest 0)
(:round-negative #x2000)
(:round-positive #x4000)
(:round-zero #x6000)
(:round-bits #x6000)
(:flush-to-zero #x8000)))
do (setf mask (logior mask bit))
finally (return mask)))
(defun expand-cpu-mxcsr-bits (tags on-fail)
(loop for tag in tags
when (keywordp tag) collect tag into kwds
else collect tag into rest
finally
(return
(cond ((and kwds rest)
`(logior ,(apply #'cpu-mxcsr-bits kwds)
(cpu-mxcsr-bits ,@rest)))
(kwds
(apply #'cpu-mxcsr-bits kwds))
(t on-fail))))))
(define-compiler-macro cpu-mxcsr-bits (&whole whole &rest tags)
(expand-cpu-mxcsr-bits tags whole))
(defmacro with-saved-mxcsr (&body code)
(let ((v (gensym "CSR")))
`(let ((,v (cpu-mxcsr)))
(declare (type (unsigned-byte 32) ,v)
#+ecl (:read-only ,v))
(unwind-protect (progn ,@code)
(%set-cpu-mxcsr ,v)))))
#+nil
(defun cpu-check-exceptions (&rest tags)
(let ((mask (logand (cpu-mxcsr-bits (or tags :except-all))
(cpu-mxcsr-bits :except-all)))
(csr (get-cpu-mxcsr)))
(declare (optimize (safety 0) (speed 3) (debug 0))
(type fixnum csr mask))
(not (zerop (logand mask csr)))))
#+nil
(define-compiler-macro cpu-check-exceptions (&whole whole &rest tags)
(let ((bits (expand-cpu-mxcsr-bits (or tags '(except-all)) nil)))
(if (integerp bits)
`(locally (declare (optimize (speed 3) (safety 0) (debug 0)))
(not (zerop (logand (cpu-get-mxcsr)
,(logand bits (cpu-mxcsr-bits :except-all))))))
whole)))
#+nil
(macrolet ((foo (&rest names)
(let* ((kwds (mapcar (lambda (x) (intern (format nil "MASK-~A" x) :keyword)) names))
(pvars (mapcar (lambda (x) (intern (format nil "~A-P" x))) names)))
`(defun cpu-mask-exceptions (&key
,@(mapcar (lambda (n p) `(,n nil ,p)) names pvars)
(other nil rest-p))
(let ((set-bits (logior ,@(mapcar (lambda (n k) `(if ,n (cpu-mxcsr-bits ,k) 0)) names kwds)))
(arg-bits (logior ,@(mapcar (lambda (p k) `(if ,p (cpu-mxcsr-bits ,k) 0)) pvars kwds))))
(%set-cpu-mxcsr
(the fixnum
(if (not rest-p)
(logior set-bits (logand (get-cpu-mxcsr) (lognot arg-bits)))
(logior set-bits
(if other (logand (cpu-mxcsr-bits :mask-all) (lognot arg-bits)) 0)
(logiand (get-cpu-mxcsr) (lognot (cpu-mxcsr-bits :mask-all)))))))
nil)))))
(foo invalid denormal divide-zero overflow underflow precision))
(defun cpu-configure-rounding (&key round-to
(denormals-are-zero nil daz-p)
(flush-to-zero nil ftz-p))
(let ((set 0)
(mask 0))
(when round-to
(setf mask (cpu-mxcsr-bits :round-bits)
set (ecase round-to
(:zero (cpu-mxcsr-bits :round-zero))
(:negative (cpu-mxcsr-bits :round-negative))
(:positive (cpu-mxcsr-bits :round-positive))
(:nearest (cpu-mxcsr-bits :round-nearest)))))
(when daz-p
(setf mask (logior mask (cpu-mxcsr-bits :denormals-are-zero)))
(when denormals-are-zero
(setf set (logior set (cpu-mxcsr-bits :denormals-are-zero)))))
(when ftz-p
(setf mask (logior mask (cpu-mxcsr-bits :flush-to-zero)))
(when flush-to-zero
(setf set (logior set (cpu-mxcsr-bits :flush-to-zero)))))
(setf (cpu-mxcsr)
(the (unsigned-byte 32)
(logior set (logand (cpu-mxcsr) (lognot mask)))))
nil))

View file

@ -0,0 +1,156 @@
;;; -*- mode: Lisp; indent-tabs-mode: nil; -*-
;;;
;;; Dumbly translated from C code at: http://github.com/jj1bdx/sfmt-extstate
(eval-when (:compile-toplevel :load-toplevel :execute)
(require :cl-simd))
(defpackage #:sfmt-test
(:use #:common-lisp #:sse))
(in-package #:sfmt-test)
(deftype uint32 () '(unsigned-byte 32))
(deftype uint32-vector () '(sse-array uint32 (*)))
(defconstant +mexp+ 19937)
(defconstant +n+ (1+ (floor +mexp+ 128)))
(defconstant +pos1+ 122)
(defconstant +sl1+ 18)
(defconstant +sl2+ 1)
(defconstant +sr1+ 11)
(defconstant +sr2+ 1)
(defconstant +msk1+ #xdfffffef)
(defconstant +msk2+ #xddfecb7f)
(defconstant +msk3+ #xbffaffff)
(defconstant +msk4+ #xbffffff6)
(defconstant +parity1+ #x00000001)
(defconstant +parity2+ #x00000000)
(defconstant +parity3+ #x00000000)
(defconstant +parity4+ #x13c9e684)
(defconstant +uint32-mask+ #xFFFFFFFF)
(defvar *work-buffer* (make-sse-array (* +n+ 4) :element-type 'uint32))
(defun period-certification (buffer)
(declare (type uint32-vector buffer))
(let ((inner (logxor (logand (aref buffer 0) +parity1+)
(logand (aref buffer 1) +parity2+)
(logand (aref buffer 2) +parity3+)
(logand (aref buffer 3) +parity4+))))
(loop for i = 16 then (ash i -1) while (> i 0)
do (setf inner (logxor inner (ash inner (- i)))))
(when (logtest inner 1)
(return-from period-certification)))
(loop
for i from 0 to 3
for parity in (load-time-value (list +parity1+ +parity2+ +parity3+ +parity4+))
do (loop
for work = 1 then (ash work 1)
for j from 0 below 32
when (/= 0 (logand work parity))
do (progn
(setf (aref buffer i)
(logxor (aref buffer i) work))
(return-from period-certification)))))
(defun init-gen-rand (seed buffer)
(declare (type uint32 seed)
(type uint32-vector buffer))
(setf (aref buffer 0) seed)
(loop for i from 1 below (array-total-size buffer)
do (setf (aref buffer i)
(logand +uint32-mask+
(+ i
(* 1812433253 (logxor (aref buffer (1- i))
(ash (aref buffer (1- i)) -30)))))))
(period-certification buffer))
;; Should be an inline function, but it's broken in ECL
(defmacro recursion (a b c d mask)
`(let ((x ,a)
(y (srli-pi32 ,b +sr1+))
(z (srli-pi ,c +sr2+))
(v (slli-pi32 ,d +sl1+))
(m ,mask))
(xor-pi (xor-pi (xor-pi z x) v)
(xor-pi (slli-pi x +sl2+)
(and-pi y m)))))
(defmacro sfmt-aref (buf idx)
`(row-major-aref-api ,buf (the fixnum (* 4 (the fixnum ,idx)))))
(defun gen-rand-all (buffer)
(declare (optimize (speed 3) #+ecl (safety 0) (debug 0)
#+sbcl (sb-c::insert-array-bounds-checks 0))
(type uint32-vector buffer))
#+ecl (check-type buffer uint32-vector)
(assert (= (array-total-size buffer) (* +n+ 4)))
(let ((mask (set-pu32 +msk4+ +msk3+ +msk2+ +msk1+))
(r1 (sfmt-aref buffer (- +n+ 2)))
(r2 (sfmt-aref buffer (- +n+ 1))))
(declare (type int-sse-pack mask r1 r2))
(macrolet ((twist (delta)
`(psetq r1 r2
r2 (setf (sfmt-aref buffer i)
(recursion (sfmt-aref buffer i)
(sfmt-aref buffer (+ i (the fixnum ,delta)))
r1 r2 mask)))))
(loop for i fixnum from 0 below (- +n+ +pos1+)
do (twist +pos1+))
(loop for i fixnum from (- +n+ +pos1+) below +n+
do (twist (- +pos1+ +n+))))))
(defun gen-rand-array (output buffer)
(declare (optimize (speed 3) #+ecl (safety 0) (debug 0)
#+sbcl (sb-c::insert-array-bounds-checks 0))
(type uint32-vector buffer output))
#+ecl (check-type buffer uint32-vector)
#+ecl (check-type output uint32-vector)
(assert (= (array-total-size buffer) (* +n+ 4)))
(let ((mask (set-pu32 +msk4+ +msk3+ +msk2+ +msk1+))
(size (floor (array-total-size output) 4))
(r1 (sfmt-aref buffer (- +n+ 2)))
(r2 (sfmt-aref buffer (- +n+ 1))))
(declare (type int-sse-pack mask r1 r2)
(type fixnum size))
(assert (> size (* +n+ 2)))
(macrolet ((twist (tgt src1 delta1 src2 delta2)
`(psetq r1 r2
r2 (setf (sfmt-aref ,tgt i)
(recursion (sfmt-aref ,src1 (- i (the fixnum ,delta1)))
(sfmt-aref ,src2 (+ i (the fixnum ,delta2)))
r1 r2 mask)))))
(loop for i fixnum from 0 below (- +n+ +pos1+)
do (twist output buffer 0 buffer +pos1+))
(loop for i fixnum from (- +n+ +pos1+) below +n+
do (twist output buffer 0 output (- +pos1+ +n+)))
(loop for i fixnum from +n+ below (- size +n+)
do (twist output output +n+ output (- +pos1+ +n+)))
#+ ()
(loop for j fixnum from 0 below (- (* 2 +n+) size)
do (setf (sfmt-aref buffer j)
(sfmt-aref output (+ j (the fixnum (- size +n+))))))
(loop
for i fixnum from (- size +n+) below size
for j fixnum from 0 below +n+ ;(max 0 (- (* 2 +n+) size))
do (twist output output +n+ output (- +pos1+ +n+))
do (setf (sfmt-aref buffer j) r2))
output)))
(defun test ()
(let ((out (make-sse-array 10000 :element-type 'uint32)))
(init-gen-rand 1234 *work-buffer*)
(gen-rand-array out *work-buffer*)
(assert (equal (coerce (subseq out 995 1000) 'list)
'(2499610950 3057240914 1662679783 461224431 1168395933)))
(gen-rand-array out *work-buffer*)
(assert (equal (coerce (subseq out 995 1000) 'list)
'(648219337 458306832 3674950976 4030368244 2918117049)))))
(dotimes (i 10)
(test))