fixed dependencies

This commit is contained in:
nuknal
2024-10-24 15:46:01 +08:00
parent d16a5bd9c0
commit 1161e8d054
2005 changed files with 690883 additions and 0 deletions

132
vendor/gonum.org/v1/gonum/AUTHORS generated vendored Normal file
View File

@@ -0,0 +1,132 @@
# This is the official list of Gonum authors for copyright purposes.
# This file is distinct from the CONTRIBUTORS files.
# See the latter for an explanation.
# Names should be added to this file as
# Name or Organization <email address>
# The email address is not required for organizations.
# Please keep the list sorted.
Alexander Egurnov <alexander.egurnov@gmail.com>
Andrei Blinnikov <goofinator@mail.ru>
antichris <chris@u-d13.com>
Bailey Lissington <lissington4@gmail.com>
Bill Gray <wgray@gogray.com>
Bill Noon <noon.bill@gmail.com>
Brendan Tracey <tracey.brendan@gmail.com>
Brent Pedersen <bpederse@gmail.com>
Bulat Khasanov <afti@yandex.ru>
Chad Kunde <kunde21@gmail.com>
Chan Kwan Yin <sofe2038@gmail.com>
Chih-Wei Chang <bert.cwchang@gmail.com>
Chong-Yeol Nah <nahchongyeol@gmail.com>
Chris Tessum <ctessum@gmail.com>
Christophe Meessen <christophe.meessen@gmail.com>
Christopher Waldon <christopher.waldon.dev@gmail.com>
Clayton Northey <clayton.northey@gmail.com>
Dan Kortschak <dan.kortschak@adelaide.edu.au> <dan@kortschak.io>
Daniel Fireman <danielfireman@gmail.com>
Dario Heinisch <dario.heinisch@gmail.com>
David Kleiven <davidkleiven446@gmail.com>
David Samborski <bloggingarrow@gmail.com>
Davor Kapsa <davor.kapsa@gmail.com>
DeepMind Technologies
Delaney Gillilan <delaneygillilan@gmail.com>
Dezmond Goff <goff.dezmond@gmail.com>
Dong-hee Na <donghee.na92@gmail.com>
Dustin Spicuzza <dustin@virtualroadside.com>
Egon Elbre <egonelbre@gmail.com>
Ekaterina Efimova <katerina.efimova@gmail.com>
Eng Zer Jun <engzerjun@gmail.com>
Ethan Burns <burns.ethan@gmail.com>
Ethan Reesor <ethan.reesor@gmail.com>
Evert Lammerts <evert.lammerts@gmail.com>
Evgeny Savinov <notime.sea@gmail.com>
Fabian Wickborn <fabian@wickborn.net>
Facundo Gaich <facugaich@gmail.com>
Fazlul Shahriar <fshahriar@gmail.com>
Francesc Campoy <campoy@golang.org>
Google Inc
Gustaf Johansson <gustaf@pinon.se>
Hossein Zolfi <hossein.zolfi@gmail.com>
Iakov Davydov <iakov.davydov@unil.ch>
Igor Mikushkin <igor.mikushkin@gmail.com>
Iskander Sharipov <quasilyte@gmail.com>
Jalem Raj Rohit <jrajrohit33@gmail.com>
James Bell <james@stellentus.com>
James Bowman <james.edward.bowman@gmail.com>
James Holmes <32bitkid@gmail.com>
Janne Snabb <snabb@epipe.com>
Jeremy Atkinson <jchatkinson@gmail.com>
Jes Cok <xigua67damn@gmail.com>
Jinesi Yelizati <i63888888@163.com>
Jonas Kahler <jonas@derkahler.de>
Jonas Schulze <jonas.schulze@ovgu.de>
Jonathan Bluett-Duncan <jbluettduncan@gmail.com>
Jonathan J Lawlor <jonathan.lawlor@gmail.com>
Jonathan Reiter <jonreiter@gmail.com>
Jonathan Schroeder <jd.schroeder@gmail.com>
Joost van Amersfoort <git@joo.st>
Jordan Stoker <jordan_stoker@hotmail.com>
Joseph Watson <jtwatson@linux-consulting.us>
Josh Wilson <josh.craig.wilson@gmail.com>
Julien Roland <juroland@gmail.com>
Kai Trukenmüller <ktye78@gmail.com>
Kent English <kent.english@gmail.com>
Kevin C. Zimmerman <kevinczimmerman@gmail.com>
Kirill Motkov <motkov.kirill@gmail.com>
Konstantin Shaposhnikov <k.shaposhnikov@gmail.com>
Leonid Kneller <recondite.matter@gmail.com>
Lyron Winderbaum <lyron.winderbaum@student.adelaide.edu.au> <armadilloa16@gmail.com> <lyron.winderbaum@uwa.edu.au>
Marco Leogrande <dark.knight.ita@gmail.com>
Mark Canning <argusdusty@gmail.com>
Mark Skilbeck <markskilbeck@gmail.com>
Martin Diz <github@martindiz.com.ar>
Matthew Connelly <matthew.b.connelly@gmail.com>
Matthieu Di Mercurio <matthieu.dimercurio@gmail.com>
Max Halford <maxhalford25@gmail.com>
Maxim Sergeev <gudvinr@gmail.com>
Microsoft Corporation
MinJae Kwon <k239507@gmail.com>
Nathan Edwards <etaoinshrdluwho@gmail.com>
Nick Potts <nick@the-potts.com>
Nils Wogatzky <odog@netcologne.de>
Olivier Wulveryck <olivier.wulveryck@gmail.com>
Or Rikon <rikonor@gmail.com>
Patricio Whittingslow <graded.sp@gmail.com>
Patrick DeVivo <patrick@tickgit.com>
Pontus Melke <pontusmelke@gmail.com>
Renee French
Rishi Desai <desai.rishi1@gmail.com>
Robin Eklind <r.eklind.87@gmail.com>
Roger Welin <roger.welin@icloud.com>
Rondall Jones <rejones7@gmail.com>
Sam Zaydel <szaydel@gmail.com>
Samuel Kelemen <Samuel@Kelemen.us>
Saran Ahluwalia <ahlusar.ahluwalia@gmail.com>
Scott Holden <scott@sshconnection.com>
Scott Kiesel <kiesel.scott@gmail.com>
Sebastien Binet <seb.binet@gmail.com>
Shawn Smith <shawnpsmith@gmail.com>
Sintela Ltd
source{d} <hello@sourced.tech>
Spencer Lyon <spencerlyon2@gmail.com>
Steve McCoy <mccoyst@gmail.com>
Taesu Pyo <pyotaesu@gmail.com>
Takeshi Yoneda <cz.rk.t0415y.g@gmail.com>
Tamir Hyman <hyman.tamir@gmail.com>
The University of Adelaide
The University of Minnesota
The University of Washington
Thomas Berg <tomfuture@gmail.com>
Tobin Harding <me@tobin.cc>
Valentin Deleplace <deleplace2015@gmail.com>
Vincent Thiery <vjmthiery@gmail.com>
Vladimír Chalupecký <vladimir.chalupecky@gmail.com>
Will Tekulve <tekulve.will@gmail.com>
Yasuhiro Matsumoto <mattn.jp@gmail.com>
Yevgeniy Vahlis <evahlis@gmail.com>
Yucheng Zhu <zyctc000@gmail.com>
Yunomi <ynmtywn@gmail.com>
Zoe Juozapaitis

135
vendor/gonum.org/v1/gonum/CONTRIBUTORS generated vendored Normal file
View File

@@ -0,0 +1,135 @@
# This is the official list of people who can contribute
# (and typically have contributed) code to the Gonum
# project.
#
# The AUTHORS file lists the copyright holders; this file
# lists people. For example, Google employees would be listed here
# but not in AUTHORS, because Google would hold the copyright.
#
# When adding J Random Contributor's name to this file,
# either J's name or J's organization's name should be
# added to the AUTHORS file.
#
# Names should be added to this file like so:
# Name <email address>
#
# Please keep the list sorted.
Alexander Egurnov <alexander.egurnov@gmail.com>
Andrei Blinnikov <goofinator@mail.ru>
Andrew Brampton <brampton@gmail.com>
antichris <chris@u-d13.com>
Bailey Lissington <lissington4@gmail.com>
Bill Gray <wgray@gogray.com>
Bill Noon <noon.bill@gmail.com>
Brendan Tracey <tracey.brendan@gmail.com>
Brent Pedersen <bpederse@gmail.com>
Bulat Khasanov <afti@yandex.ru>
Chad Kunde <kunde21@gmail.com>
Chan Kwan Yin <sofe2038@gmail.com>
Chih-Wei Chang <bert.cwchang@gmail.com>
Chong-Yeol Nah <nahchongyeol@gmail.com>
Chris Tessum <ctessum@gmail.com>
Christophe Meessen <christophe.meessen@gmail.com>
Christopher Waldon <christopher.waldon.dev@gmail.com>
Clayton Northey <clayton.northey@gmail.com>
Dan Kortschak <dan.kortschak@adelaide.edu.au> <dan@kortschak.io>
Dan Lorenc <lorenc.d@gmail.com>
Daniel Fireman <danielfireman@gmail.com>
Dario Heinisch <dario.heinisch@gmail.com>
David Kleiven <davidkleiven446@gmail.com>
David Samborski <bloggingarrow@gmail.com>
Davor Kapsa <davor.kapsa@gmail.com>
Delaney Gillilan <delaneygillilan@gmail.com>
Dezmond Goff <goff.dezmond@gmail.com>
Dong-hee Na <donghee.na92@gmail.com>
Dustin Spicuzza <dustin@virtualroadside.com>
Egon Elbre <egonelbre@gmail.com>
Ekaterina Efimova <katerina.efimova@gmail.com>
Eng Zer Jun <engzerjun@gmail.com>
Ethan Burns <burns.ethan@gmail.com>
Ethan Reesor <ethan.reesor@gmail.com>
Evert Lammerts <evert.lammerts@gmail.com>
Evgeny Savinov <notime.sea@gmail.com>
Fabian Wickborn <fabian@wickborn.net>
Facundo Gaich <facugaich@gmail.com>
Fazlul Shahriar <fshahriar@gmail.com>
Francesc Campoy <campoy@golang.org>
Gustaf Johansson <gustaf@pinon.se>
Hossein Zolfi <hossein.zolfi@gmail.com>
Iakov Davydov <iakov.davydov@unil.ch>
Igor Mikushkin <igor.mikushkin@gmail.com>
Iskander Sharipov <quasilyte@gmail.com>
Jalem Raj Rohit <jrajrohit33@gmail.com>
James Bell <james@stellentus.com>
James Bowman <james.edward.bowman@gmail.com>
James Holmes <32bitkid@gmail.com>
Janne Snabb <snabb@epipe.com>
Jeremy Atkinson <jchatkinson@gmail.com>
Jes Cok <xigua67damn@gmail.com>
Jinesi Yelizati <i63888888@163.com>
Jon Richards <noj.richards@gmail.com>
Jonas Kahler <jonas@derkahler.de>
Jonas Schulze <jonas.schulze@ovgu.de>
Jonathan Bluett-Duncan <jbluettduncan@gmail.com>
Jonathan J Lawlor <jonathan.lawlor@gmail.com>
Jonathan Reiter <jonreiter@gmail.com>
Jonathan Schroeder <jd.schroeder@gmail.com>
Joost van Amersfoort <git@joo.st>
Jordan Stoker <jordan_stoker@hotmail.com>
Joseph Watson <jtwatson@linux-consulting.us>
Josh Wilson <josh.craig.wilson@gmail.com>
Julien Roland <juroland@gmail.com>
Kai Trukenmüller <ktye78@gmail.com>
Kent English <kent.english@gmail.com>
Kevin C. Zimmerman <kevinczimmerman@gmail.com>
Kirill Motkov <motkov.kirill@gmail.com>
Konstantin Shaposhnikov <k.shaposhnikov@gmail.com>
Leonid Kneller <recondite.matter@gmail.com>
Lyron Winderbaum <lyron.winderbaum@student.adelaide.edu.au> <armadilloa16@gmail.com> <lyron.winderbaum@uwa.edu.au>
Marco Leogrande <dark.knight.ita@gmail.com>
Mark Canning <argusdusty@gmail.com>
Mark Skilbeck <markskilbeck@gmail.com>
Martin Diz <github@martindiz.com.ar>
Matthew Connelly <matthew.b.connelly@gmail.com>
Matthieu Di Mercurio <matthieu.dimercurio@gmail.com>
Max Halford <maxhalford25@gmail.com>
Maxim Sergeev <gudvinr@gmail.com>
MinJae Kwon <k239507@gmail.com>
Nathan Edwards <etaoinshrdluwho@gmail.com>
Nick Potts <nick@the-potts.com>
Nils Wogatzky <odog@netcologne.de>
Olivier Wulveryck <olivier.wulveryck@gmail.com>
Or Rikon <rikonor@gmail.com>
Patricio Whittingslow <graded.sp@gmail.com>
Patrick DeVivo <patrick@tickgit.com>
Pontus Melke <pontusmelke@gmail.com>
Renee French
Rishi Desai <desai.rishi1@gmail.com>
Robin Eklind <r.eklind.87@gmail.com>
Roger Welin <roger.welin@icloud.com>
Roman Werpachowski <roman.werpachowski@gmail.com>
Rondall Jones <rejones7@gmail.com>
Sam Zaydel <szaydel@gmail.com>
Samuel Kelemen <Samuel@Kelemen.us>
Saran Ahluwalia <ahlusar.ahluwalia@gmail.com>
Scott Holden <scott@sshconnection.com>
Scott Kiesel <kiesel.scott@gmail.com>
Sebastien Binet <seb.binet@gmail.com>
Shawn Smith <shawnpsmith@gmail.com>
Spencer Lyon <spencerlyon2@gmail.com>
Steve McCoy <mccoyst@gmail.com>
Taesu Pyo <pyotaesu@gmail.com>
Takeshi Yoneda <cz.rk.t0415y.g@gmail.com>
Tamir Hyman <hyman.tamir@gmail.com>
Thomas Berg <tomfuture@gmail.com>
Tobin Harding <me@tobin.cc>
Valentin Deleplace <deleplace2015@gmail.com>
Vincent Thiery <vjmthiery@gmail.com>
Vladimír Chalupecký <vladimir.chalupecky@gmail.com>
Will Tekulve <tekulve.will@gmail.com>
Yasuhiro Matsumoto <mattn.jp@gmail.com>
Yevgeniy Vahlis <evahlis@gmail.com>
Yucheng Zhu <zyctc000@gmail.com>
Yunomi <ynmtywn@gmail.com>
Zoe Juozapaitis

23
vendor/gonum.org/v1/gonum/LICENSE generated vendored Normal file
View File

@@ -0,0 +1,23 @@
Copyright ©2013 The Gonum Authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the Gonum project nor the names of its authors and
contributors may be used to endorse or promote products derived from this
software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

51
vendor/gonum.org/v1/gonum/blas/README.md generated vendored Normal file
View File

@@ -0,0 +1,51 @@
# Gonum BLAS
[![go.dev reference](https://pkg.go.dev/badge/gonum.org/v1/gonum/blas)](https://pkg.go.dev/gonum.org/v1/gonum/blas)
[![GoDoc](https://godocs.io/gonum.org/v1/gonum/blas?status.svg)](https://godocs.io/gonum.org/v1/gonum/blas)
A collection of packages to provide BLAS functionality for the [Go programming
language](http://golang.org).
## Installation
```sh
go get gonum.org/v1/gonum/blas/...
```
## Packages
### blas
Defines [BLAS API](http://www.netlib.org/blas/blast-forum/cinterface.pdf) split in several
interfaces.
### blas/gonum
Go implementation of the BLAS API (incomplete, implements the `float32` and `float64` API).
### blas/blas64 and blas/blas32
Wrappers for an implementation of the double (i.e., `float64`) and single (`float32`)
precision real parts of the BLAS API.
```Go
package main
import (
"fmt"
"gonum.org/v1/gonum/blas/blas64"
)
func main() {
v := blas64.Vector{Inc: 1, Data: []float64{1, 1, 1}}
v.N = len(v.Data)
fmt.Println("v has length:", blas64.Nrm2(v))
}
```
### blas/cblas128 and blas/cblas64
Wrappers for an implementation of the double (i.e., `complex128`) and single (`complex64`)
precision complex parts of the BLAS API.
Currently blas/cblas64 and blas/cblas128 require gonum.org/v1/netlib/blas.

283
vendor/gonum.org/v1/gonum/blas/blas.go generated vendored Normal file
View File

@@ -0,0 +1,283 @@
// Copyright ©2013 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:generate ./conversions.bash
package blas
// Flag constants indicate Givens transformation H matrix state.
//
// Flag is embedded in SrotmParams and DrotmParams, where it accompanies
// the H array that stores the 2 by 2 matrix.
type Flag int
const (
Identity Flag = -2 // H is the identity matrix; no rotation is needed.
Rescaling Flag = -1 // H specifies rescaling.
OffDiagonal Flag = 0 // Off-diagonal elements of H are non-unit.
Diagonal Flag = 1 // Diagonal elements of H are non-unit.
)
// SrotmParams contains Givens transformation parameters returned
// by the Float32Level1 Srotmg method and accepted by its Srotm method.
type SrotmParams struct {
// Flag is embedded and describes the state of H.
Flag
H [4]float32 // Column-major 2 by 2 matrix.
}
// DrotmParams contains Givens transformation parameters returned
// by the Float64Level1 Drotmg method and accepted by its Drotm method.
type DrotmParams struct {
// Flag is embedded and describes the state of H.
Flag
H [4]float64 // Column-major 2 by 2 matrix.
}
// Transpose specifies the transposition operation of a matrix.
//
// The underlying byte of each constant is an ASCII letter.
type Transpose byte
const (
// NoTrans has the byte value 'N'.
NoTrans Transpose = 'N'
// Trans has the byte value 'T'.
Trans Transpose = 'T'
// ConjTrans has the byte value 'C'.
ConjTrans Transpose = 'C'
)
// Uplo specifies whether a matrix is upper or lower triangular.
//
// The underlying byte of each constant is an ASCII letter.
type Uplo byte
const (
// Upper has the byte value 'U'.
Upper Uplo = 'U'
// Lower has the byte value 'L'.
Lower Uplo = 'L'
// All has the byte value 'A'.
// NOTE(review): presumably selects the whole matrix rather than one
// triangle; confirm against the routines that accept it.
All Uplo = 'A'
)
// Diag specifies whether a matrix is unit triangular.
//
// The underlying byte of each constant is an ASCII letter.
type Diag byte
const (
// NonUnit has the byte value 'N'.
NonUnit Diag = 'N'
// Unit has the byte value 'U'.
Unit Diag = 'U'
)
// Side specifies from which side a multiplication operation is performed.
//
// The underlying byte of each constant is an ASCII letter.
type Side byte
const (
// Left has the byte value 'L'.
Left Side = 'L'
// Right has the byte value 'R'.
Right Side = 'R'
)
// Float32 implements the single precision real BLAS routines.
//
// It is the union of the Float32Level1, Float32Level2 and Float32Level3
// interfaces.
type Float32 interface {
Float32Level1
Float32Level2
Float32Level3
}
// Float32Level1 implements the single precision real BLAS Level 1 routines.
//
// Vectors are passed as a []float32 together with an element count n and
// an increment (incX, incY). Dsdot takes []float32 inputs but returns a
// float64. Srotmg returns the SrotmParams value that Srotm accepts.
type Float32Level1 interface {
Sdsdot(n int, alpha float32, x []float32, incX int, y []float32, incY int) float32
Dsdot(n int, x []float32, incX int, y []float32, incY int) float64
Sdot(n int, x []float32, incX int, y []float32, incY int) float32
Snrm2(n int, x []float32, incX int) float32
Sasum(n int, x []float32, incX int) float32
Isamax(n int, x []float32, incX int) int
Sswap(n int, x []float32, incX int, y []float32, incY int)
Scopy(n int, x []float32, incX int, y []float32, incY int)
Saxpy(n int, alpha float32, x []float32, incX int, y []float32, incY int)
Srotg(a, b float32) (c, s, r, z float32)
Srotmg(d1, d2, b1, b2 float32) (p SrotmParams, rd1, rd2, rb1 float32)
Srot(n int, x []float32, incX int, y []float32, incY int, c, s float32)
Srotm(n int, x []float32, incX int, y []float32, incY int, p SrotmParams)
Sscal(n int, alpha float32, x []float32, incX int)
}
// Float32Level2 implements the single precision real BLAS Level 2 routines.
//
// Matrices are passed as a []float32, accompanied by lda where the method
// declares it; vectors are passed as a []float32 with an increment.
//
// NOTE(review): Sspr2 names its matrix argument a but, like the ap argument
// of Sspr, takes no lda; presumably it is also packed storage. Confirm.
type Float32Level2 interface {
Sgemv(tA Transpose, m, n int, alpha float32, a []float32, lda int, x []float32, incX int, beta float32, y []float32, incY int)
Sgbmv(tA Transpose, m, n, kL, kU int, alpha float32, a []float32, lda int, x []float32, incX int, beta float32, y []float32, incY int)
Strmv(ul Uplo, tA Transpose, d Diag, n int, a []float32, lda int, x []float32, incX int)
Stbmv(ul Uplo, tA Transpose, d Diag, n, k int, a []float32, lda int, x []float32, incX int)
Stpmv(ul Uplo, tA Transpose, d Diag, n int, ap []float32, x []float32, incX int)
Strsv(ul Uplo, tA Transpose, d Diag, n int, a []float32, lda int, x []float32, incX int)
Stbsv(ul Uplo, tA Transpose, d Diag, n, k int, a []float32, lda int, x []float32, incX int)
Stpsv(ul Uplo, tA Transpose, d Diag, n int, ap []float32, x []float32, incX int)
Ssymv(ul Uplo, n int, alpha float32, a []float32, lda int, x []float32, incX int, beta float32, y []float32, incY int)
Ssbmv(ul Uplo, n, k int, alpha float32, a []float32, lda int, x []float32, incX int, beta float32, y []float32, incY int)
Sspmv(ul Uplo, n int, alpha float32, ap []float32, x []float32, incX int, beta float32, y []float32, incY int)
Sger(m, n int, alpha float32, x []float32, incX int, y []float32, incY int, a []float32, lda int)
Ssyr(ul Uplo, n int, alpha float32, x []float32, incX int, a []float32, lda int)
Sspr(ul Uplo, n int, alpha float32, x []float32, incX int, ap []float32)
Ssyr2(ul Uplo, n int, alpha float32, x []float32, incX int, y []float32, incY int, a []float32, lda int)
Sspr2(ul Uplo, n int, alpha float32, x []float32, incX int, y []float32, incY int, a []float32)
}
// Float32Level3 implements the single precision real BLAS Level 3 routines.
//
// Every matrix argument is a []float32 paired with its own leading
// dimension parameter (lda, ldb, ldc).
type Float32Level3 interface {
Sgemm(tA, tB Transpose, m, n, k int, alpha float32, a []float32, lda int, b []float32, ldb int, beta float32, c []float32, ldc int)
Ssymm(s Side, ul Uplo, m, n int, alpha float32, a []float32, lda int, b []float32, ldb int, beta float32, c []float32, ldc int)
Ssyrk(ul Uplo, t Transpose, n, k int, alpha float32, a []float32, lda int, beta float32, c []float32, ldc int)
Ssyr2k(ul Uplo, t Transpose, n, k int, alpha float32, a []float32, lda int, b []float32, ldb int, beta float32, c []float32, ldc int)
Strmm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha float32, a []float32, lda int, b []float32, ldb int)
Strsm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha float32, a []float32, lda int, b []float32, ldb int)
}
// Float64 implements the double precision real BLAS routines.
//
// It is the union of the Float64Level1, Float64Level2 and Float64Level3
// interfaces.
type Float64 interface {
Float64Level1
Float64Level2
Float64Level3
}
// Float64Level1 implements the double precision real BLAS Level 1 routines.
//
// Vectors are passed as a []float64 together with an element count n and
// an increment (incX, incY). Drotmg returns the DrotmParams value that
// Drotm accepts.
type Float64Level1 interface {
Ddot(n int, x []float64, incX int, y []float64, incY int) float64
Dnrm2(n int, x []float64, incX int) float64
Dasum(n int, x []float64, incX int) float64
Idamax(n int, x []float64, incX int) int
Dswap(n int, x []float64, incX int, y []float64, incY int)
Dcopy(n int, x []float64, incX int, y []float64, incY int)
Daxpy(n int, alpha float64, x []float64, incX int, y []float64, incY int)
Drotg(a, b float64) (c, s, r, z float64)
Drotmg(d1, d2, b1, b2 float64) (p DrotmParams, rd1, rd2, rb1 float64)
Drot(n int, x []float64, incX int, y []float64, incY int, c float64, s float64)
Drotm(n int, x []float64, incX int, y []float64, incY int, p DrotmParams)
Dscal(n int, alpha float64, x []float64, incX int)
}
// Float64Level2 implements the double precision real BLAS Level 2 routines.
//
// Matrices are passed as a []float64, accompanied by lda where the method
// declares it; vectors are passed as a []float64 with an increment.
//
// NOTE(review): Dspr2 names its matrix argument a but, like the ap argument
// of Dspr, takes no lda; presumably it is also packed storage. Confirm.
type Float64Level2 interface {
Dgemv(tA Transpose, m, n int, alpha float64, a []float64, lda int, x []float64, incX int, beta float64, y []float64, incY int)
Dgbmv(tA Transpose, m, n, kL, kU int, alpha float64, a []float64, lda int, x []float64, incX int, beta float64, y []float64, incY int)
Dtrmv(ul Uplo, tA Transpose, d Diag, n int, a []float64, lda int, x []float64, incX int)
Dtbmv(ul Uplo, tA Transpose, d Diag, n, k int, a []float64, lda int, x []float64, incX int)
Dtpmv(ul Uplo, tA Transpose, d Diag, n int, ap []float64, x []float64, incX int)
Dtrsv(ul Uplo, tA Transpose, d Diag, n int, a []float64, lda int, x []float64, incX int)
Dtbsv(ul Uplo, tA Transpose, d Diag, n, k int, a []float64, lda int, x []float64, incX int)
Dtpsv(ul Uplo, tA Transpose, d Diag, n int, ap []float64, x []float64, incX int)
Dsymv(ul Uplo, n int, alpha float64, a []float64, lda int, x []float64, incX int, beta float64, y []float64, incY int)
Dsbmv(ul Uplo, n, k int, alpha float64, a []float64, lda int, x []float64, incX int, beta float64, y []float64, incY int)
Dspmv(ul Uplo, n int, alpha float64, ap []float64, x []float64, incX int, beta float64, y []float64, incY int)
Dger(m, n int, alpha float64, x []float64, incX int, y []float64, incY int, a []float64, lda int)
Dsyr(ul Uplo, n int, alpha float64, x []float64, incX int, a []float64, lda int)
Dspr(ul Uplo, n int, alpha float64, x []float64, incX int, ap []float64)
Dsyr2(ul Uplo, n int, alpha float64, x []float64, incX int, y []float64, incY int, a []float64, lda int)
Dspr2(ul Uplo, n int, alpha float64, x []float64, incX int, y []float64, incY int, a []float64)
}
// Float64Level3 implements the double precision real BLAS Level 3 routines.
//
// Every matrix argument is a []float64 paired with its own leading
// dimension parameter (lda, ldb, ldc).
type Float64Level3 interface {
Dgemm(tA, tB Transpose, m, n, k int, alpha float64, a []float64, lda int, b []float64, ldb int, beta float64, c []float64, ldc int)
Dsymm(s Side, ul Uplo, m, n int, alpha float64, a []float64, lda int, b []float64, ldb int, beta float64, c []float64, ldc int)
Dsyrk(ul Uplo, t Transpose, n, k int, alpha float64, a []float64, lda int, beta float64, c []float64, ldc int)
Dsyr2k(ul Uplo, t Transpose, n, k int, alpha float64, a []float64, lda int, b []float64, ldb int, beta float64, c []float64, ldc int)
Dtrmm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha float64, a []float64, lda int, b []float64, ldb int)
Dtrsm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha float64, a []float64, lda int, b []float64, ldb int)
}
// Complex64 implements the single precision complex BLAS routines.
//
// It is the union of the Complex64Level1, Complex64Level2 and
// Complex64Level3 interfaces.
type Complex64 interface {
Complex64Level1
Complex64Level2
Complex64Level3
}
// Complex64Level1 implements the single precision complex BLAS Level 1 routines.
//
// Vectors are passed as a []complex64 together with an element count n and
// an increment. Scnrm2 and Scasum return float32, and Csscal takes a
// float32 scale factor where Cscal takes a complex64 one.
type Complex64Level1 interface {
Cdotu(n int, x []complex64, incX int, y []complex64, incY int) (dotu complex64)
Cdotc(n int, x []complex64, incX int, y []complex64, incY int) (dotc complex64)
Scnrm2(n int, x []complex64, incX int) float32
Scasum(n int, x []complex64, incX int) float32
Icamax(n int, x []complex64, incX int) int
Cswap(n int, x []complex64, incX int, y []complex64, incY int)
Ccopy(n int, x []complex64, incX int, y []complex64, incY int)
Caxpy(n int, alpha complex64, x []complex64, incX int, y []complex64, incY int)
Cscal(n int, alpha complex64, x []complex64, incX int)
Csscal(n int, alpha float32, x []complex64, incX int)
}
// Complex64Level2 implements the single precision complex BLAS Level 2 routines.
//
// Matrices are passed as a []complex64, accompanied by lda where the method
// declares it; vectors are passed as a []complex64 with an increment.
// Cher and Chpr take a float32 alpha; the other methods take complex64
// scalars.
//
// NOTE(review): Chpr names its matrix argument a but, like the ap argument
// of Chpr2, takes no lda; presumably it is also packed storage. Confirm.
type Complex64Level2 interface {
Cgemv(tA Transpose, m, n int, alpha complex64, a []complex64, lda int, x []complex64, incX int, beta complex64, y []complex64, incY int)
Cgbmv(tA Transpose, m, n, kL, kU int, alpha complex64, a []complex64, lda int, x []complex64, incX int, beta complex64, y []complex64, incY int)
Ctrmv(ul Uplo, tA Transpose, d Diag, n int, a []complex64, lda int, x []complex64, incX int)
Ctbmv(ul Uplo, tA Transpose, d Diag, n, k int, a []complex64, lda int, x []complex64, incX int)
Ctpmv(ul Uplo, tA Transpose, d Diag, n int, ap []complex64, x []complex64, incX int)
Ctrsv(ul Uplo, tA Transpose, d Diag, n int, a []complex64, lda int, x []complex64, incX int)
Ctbsv(ul Uplo, tA Transpose, d Diag, n, k int, a []complex64, lda int, x []complex64, incX int)
Ctpsv(ul Uplo, tA Transpose, d Diag, n int, ap []complex64, x []complex64, incX int)
Chemv(ul Uplo, n int, alpha complex64, a []complex64, lda int, x []complex64, incX int, beta complex64, y []complex64, incY int)
Chbmv(ul Uplo, n, k int, alpha complex64, a []complex64, lda int, x []complex64, incX int, beta complex64, y []complex64, incY int)
Chpmv(ul Uplo, n int, alpha complex64, ap []complex64, x []complex64, incX int, beta complex64, y []complex64, incY int)
Cgeru(m, n int, alpha complex64, x []complex64, incX int, y []complex64, incY int, a []complex64, lda int)
Cgerc(m, n int, alpha complex64, x []complex64, incX int, y []complex64, incY int, a []complex64, lda int)
Cher(ul Uplo, n int, alpha float32, x []complex64, incX int, a []complex64, lda int)
Chpr(ul Uplo, n int, alpha float32, x []complex64, incX int, a []complex64)
Cher2(ul Uplo, n int, alpha complex64, x []complex64, incX int, y []complex64, incY int, a []complex64, lda int)
Chpr2(ul Uplo, n int, alpha complex64, x []complex64, incX int, y []complex64, incY int, ap []complex64)
}
// Complex64Level3 implements the single precision complex BLAS Level 3 routines.
//
// Every matrix argument is a []complex64 paired with its own leading
// dimension parameter (lda, ldb, ldc). Cherk takes float32 alpha and beta,
// and Cher2k takes a complex64 alpha with a float32 beta; the remaining
// methods take complex64 scalars.
type Complex64Level3 interface {
Cgemm(tA, tB Transpose, m, n, k int, alpha complex64, a []complex64, lda int, b []complex64, ldb int, beta complex64, c []complex64, ldc int)
Csymm(s Side, ul Uplo, m, n int, alpha complex64, a []complex64, lda int, b []complex64, ldb int, beta complex64, c []complex64, ldc int)
Csyrk(ul Uplo, t Transpose, n, k int, alpha complex64, a []complex64, lda int, beta complex64, c []complex64, ldc int)
Csyr2k(ul Uplo, t Transpose, n, k int, alpha complex64, a []complex64, lda int, b []complex64, ldb int, beta complex64, c []complex64, ldc int)
Ctrmm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha complex64, a []complex64, lda int, b []complex64, ldb int)
Ctrsm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha complex64, a []complex64, lda int, b []complex64, ldb int)
Chemm(s Side, ul Uplo, m, n int, alpha complex64, a []complex64, lda int, b []complex64, ldb int, beta complex64, c []complex64, ldc int)
Cherk(ul Uplo, t Transpose, n, k int, alpha float32, a []complex64, lda int, beta float32, c []complex64, ldc int)
Cher2k(ul Uplo, t Transpose, n, k int, alpha complex64, a []complex64, lda int, b []complex64, ldb int, beta float32, c []complex64, ldc int)
}
// Complex128 implements the double precision complex BLAS routines.
//
// It is the union of the Complex128Level1, Complex128Level2 and
// Complex128Level3 interfaces.
type Complex128 interface {
Complex128Level1
Complex128Level2
Complex128Level3
}
// Complex128Level1 implements the double precision complex BLAS Level 1 routines.
//
// Vectors are passed as a []complex128 together with an element count n
// and an increment. Dznrm2 and Dzasum return float64, and Zdscal takes a
// float64 scale factor where Zscal takes a complex128 one.
type Complex128Level1 interface {
Zdotu(n int, x []complex128, incX int, y []complex128, incY int) (dotu complex128)
Zdotc(n int, x []complex128, incX int, y []complex128, incY int) (dotc complex128)
Dznrm2(n int, x []complex128, incX int) float64
Dzasum(n int, x []complex128, incX int) float64
Izamax(n int, x []complex128, incX int) int
Zswap(n int, x []complex128, incX int, y []complex128, incY int)
Zcopy(n int, x []complex128, incX int, y []complex128, incY int)
Zaxpy(n int, alpha complex128, x []complex128, incX int, y []complex128, incY int)
Zscal(n int, alpha complex128, x []complex128, incX int)
Zdscal(n int, alpha float64, x []complex128, incX int)
}
// Complex128Level2 implements the double precision complex BLAS Level 2 routines.
//
// Matrices are passed as a []complex128, accompanied by lda where the
// method declares it; vectors are passed as a []complex128 with an
// increment. Zher and Zhpr take a float64 alpha; the other methods take
// complex128 scalars.
//
// NOTE(review): Zhpr names its matrix argument a but, like the ap argument
// of Zhpr2, takes no lda; presumably it is also packed storage. Confirm.
type Complex128Level2 interface {
Zgemv(tA Transpose, m, n int, alpha complex128, a []complex128, lda int, x []complex128, incX int, beta complex128, y []complex128, incY int)
Zgbmv(tA Transpose, m, n int, kL int, kU int, alpha complex128, a []complex128, lda int, x []complex128, incX int, beta complex128, y []complex128, incY int)
Ztrmv(ul Uplo, tA Transpose, d Diag, n int, a []complex128, lda int, x []complex128, incX int)
Ztbmv(ul Uplo, tA Transpose, d Diag, n, k int, a []complex128, lda int, x []complex128, incX int)
Ztpmv(ul Uplo, tA Transpose, d Diag, n int, ap []complex128, x []complex128, incX int)
Ztrsv(ul Uplo, tA Transpose, d Diag, n int, a []complex128, lda int, x []complex128, incX int)
Ztbsv(ul Uplo, tA Transpose, d Diag, n, k int, a []complex128, lda int, x []complex128, incX int)
Ztpsv(ul Uplo, tA Transpose, d Diag, n int, ap []complex128, x []complex128, incX int)
Zhemv(ul Uplo, n int, alpha complex128, a []complex128, lda int, x []complex128, incX int, beta complex128, y []complex128, incY int)
Zhbmv(ul Uplo, n, k int, alpha complex128, a []complex128, lda int, x []complex128, incX int, beta complex128, y []complex128, incY int)
Zhpmv(ul Uplo, n int, alpha complex128, ap []complex128, x []complex128, incX int, beta complex128, y []complex128, incY int)
Zgeru(m, n int, alpha complex128, x []complex128, incX int, y []complex128, incY int, a []complex128, lda int)
Zgerc(m, n int, alpha complex128, x []complex128, incX int, y []complex128, incY int, a []complex128, lda int)
Zher(ul Uplo, n int, alpha float64, x []complex128, incX int, a []complex128, lda int)
Zhpr(ul Uplo, n int, alpha float64, x []complex128, incX int, a []complex128)
Zher2(ul Uplo, n int, alpha complex128, x []complex128, incX int, y []complex128, incY int, a []complex128, lda int)
Zhpr2(ul Uplo, n int, alpha complex128, x []complex128, incX int, y []complex128, incY int, ap []complex128)
}
// Complex128Level3 implements the double precision complex BLAS Level 3 routines.
//
// Every matrix argument is a []complex128 paired with its own leading
// dimension parameter (lda, ldb, ldc). Zherk takes float64 alpha and beta,
// and Zher2k takes a complex128 alpha with a float64 beta; the remaining
// methods take complex128 scalars.
type Complex128Level3 interface {
Zgemm(tA, tB Transpose, m, n, k int, alpha complex128, a []complex128, lda int, b []complex128, ldb int, beta complex128, c []complex128, ldc int)
Zsymm(s Side, ul Uplo, m, n int, alpha complex128, a []complex128, lda int, b []complex128, ldb int, beta complex128, c []complex128, ldc int)
Zsyrk(ul Uplo, t Transpose, n, k int, alpha complex128, a []complex128, lda int, beta complex128, c []complex128, ldc int)
Zsyr2k(ul Uplo, t Transpose, n, k int, alpha complex128, a []complex128, lda int, b []complex128, ldb int, beta complex128, c []complex128, ldc int)
Ztrmm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha complex128, a []complex128, lda int, b []complex128, ldb int)
Ztrsm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha complex128, a []complex128, lda int, b []complex128, ldb int)
Zhemm(s Side, ul Uplo, m, n int, alpha complex128, a []complex128, lda int, b []complex128, ldb int, beta complex128, c []complex128, ldc int)
Zherk(ul Uplo, t Transpose, n, k int, alpha float64, a []complex128, lda int, beta float64, c []complex128, ldc int)
Zher2k(ul Uplo, t Transpose, n, k int, alpha complex128, a []complex128, lda int, b []complex128, ldb int, beta float64, c []complex128, ldc int)
}

533
vendor/gonum.org/v1/gonum/blas/blas64/blas64.go generated vendored Normal file
View File

@@ -0,0 +1,533 @@
// Copyright ©2015 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package blas64
import (
"gonum.org/v1/gonum/blas"
"gonum.org/v1/gonum/blas/gonum"
)
// blas64 is the package-level BLAS float64 implementation that the wrapper
// functions in this package dispatch to. It starts as gonum.Implementation{}
// and is replaced by Use.
var blas64 blas.Float64 = gonum.Implementation{}
// Use sets the BLAS float64 implementation to be used by subsequent BLAS calls.
// The default implementation is
// gonum.org/v1/gonum/blas/gonum.Implementation.
//
// NOTE(review): the package variable is assigned without any
// synchronization; presumably Use is meant to be called before other
// goroutines use this package. Confirm.
func Use(b blas.Float64) {
blas64 = b
}
// Implementation returns the current BLAS float64 implementation.
//
// Implementation allows direct calls to the current BLAS float64 implementation
// giving finer control of parameters.
//
// The returned value is whatever was last passed to Use, or the default
// gonum.Implementation{} if Use has not been called.
func Implementation() blas.Float64 {
return blas64
}
// Vector represents a vector with an associated element increment.
type Vector struct {
// N is the element count; the wrappers in this package pass it as the
// n argument of the underlying BLAS routine.
N int
// Data is the backing storage.
Data []float64
// Inc is the element increment. Nrm2, Asum and Iamax panic if it is
// negative.
Inc int
}
// General represents a matrix using the conventional storage scheme.
type General struct {
// Rows and Cols are the matrix dimensions.
Rows, Cols int
// Data is the backing storage.
Data []float64
// Stride is presumably the distance in Data between the starts of
// consecutive rows (the BLAS "lda") — TODO confirm against the callers.
Stride int
}
// Band represents a band matrix using the band storage scheme.
type Band struct {
// Rows and Cols are the matrix dimensions.
Rows, Cols int
// KL and KU presumably count the sub-diagonals and super-diagonals,
// matching the kL and kU arguments of blas.Float64Level2.Dgbmv —
// TODO confirm.
KL, KU int
// Data is the backing storage.
Data []float64
// Stride presumably plays the role of the BLAS "lda" — TODO confirm.
Stride int
}
// Triangular represents a triangular matrix using the conventional storage scheme.
type Triangular struct {
// Uplo records whether the matrix is upper or lower triangular.
Uplo blas.Uplo
// Diag records whether the matrix is unit triangular.
Diag blas.Diag
// N is presumably the order of the (square) matrix — TODO confirm.
N int
// Data is the backing storage.
Data []float64
// Stride presumably plays the role of the BLAS "lda" — TODO confirm.
Stride int
}
// TriangularBand represents a triangular matrix using the band storage scheme.
type TriangularBand struct {
// Uplo records whether the matrix is upper or lower triangular.
Uplo blas.Uplo
// Diag records whether the matrix is unit triangular.
Diag blas.Diag
// N is presumably the order of the matrix and K the number of
// off-diagonals stored, matching the n and k arguments of
// blas.Float64Level2.Dtbmv — TODO confirm.
N, K int
// Data is the backing storage.
Data []float64
// Stride presumably plays the role of the BLAS "lda" — TODO confirm.
Stride int
}
// TriangularPacked represents a triangular matrix using the packed storage scheme.
//
// There is no Stride field: Uplo, Diag, N and Data are all that is passed to
// the packed BLAS routines (for example Dtpmv and Dtpsv).
type TriangularPacked struct {
Uplo blas.Uplo
Diag blas.Diag
N int
Data []float64
}
// Symmetric represents a symmetric matrix using the conventional storage scheme.
//
// N is the order of the matrix and Uplo is forwarded to the BLAS routines as
// their uplo argument. The conversion routines of this package only touch the
// triangle selected by Uplo, reading element (i, j) from Data[i*Stride+j].
type Symmetric struct {
Uplo blas.Uplo
N int
Data []float64
Stride int
}
// SymmetricBand represents a symmetric matrix using the band storage scheme.
//
// N is the order of the matrix and K its bandwidth. The conversion routines
// of this package require Stride >= K+1.
type SymmetricBand struct {
Uplo blas.Uplo
N, K int
Data []float64
Stride int
}
// SymmetricPacked represents a symmetric matrix using the packed storage scheme.
//
// There is no Stride field: Uplo, N and Data are all that is passed to the
// packed BLAS routines (for example Dspmv and Dspr).
type SymmetricPacked struct {
Uplo blas.Uplo
N int
Data []float64
}
// Level 1
// Panic messages used by the Level 1 wrappers when validating their vector
// arguments.
const (
// negInc is the panic value for a negative Vector.Inc.
negInc = "blas64: negative vector increment"
// badLength is the panic value for two vectors whose N fields differ.
badLength = "blas64: vector length mismatch"
)
// Dot returns the dot product of x and y:
//
//	\sum_i x[i]*y[i].
//
// Dot panics if x and y do not have the same length.
func Dot(x, y Vector) float64 {
	n := x.N
	if n != y.N {
		panic(badLength)
	}
	return blas64.Ddot(n, x.Data, x.Inc, y.Data, y.Inc)
}
// Nrm2 returns the Euclidean norm of x:
//
//	sqrt(\sum_i x[i]*x[i]).
//
// Nrm2 panics if the increment of x is negative.
func Nrm2(x Vector) float64 {
	if inc := x.Inc; inc >= 0 {
		return blas64.Dnrm2(x.N, x.Data, inc)
	}
	panic(negInc)
}
// Asum returns the sum of the absolute values of the elements of x:
//
//	\sum_i |x[i]|.
//
// Asum panics if the increment of x is negative.
func Asum(x Vector) float64 {
	if inc := x.Inc; inc >= 0 {
		return blas64.Dasum(x.N, x.Data, inc)
	}
	panic(negInc)
}
// Iamax returns the index of an element of x with the largest absolute
// value. When several elements share that value, the earliest index wins.
// Iamax returns -1 if n == 0.
//
// Iamax panics if the increment of x is negative.
func Iamax(x Vector) int {
	if x.Inc < 0 {
		panic(negInc)
	}
	idx := blas64.Idamax(x.N, x.Data, x.Inc)
	return idx
}
// Swap exchanges the contents of x and y element by element:
//
//	x[i], y[i] = y[i], x[i] for all i.
//
// Swap panics if x and y do not have the same length.
func Swap(x, y Vector) {
	n := x.N
	if n != y.N {
		panic(badLength)
	}
	blas64.Dswap(n, x.Data, x.Inc, y.Data, y.Inc)
}
// Copy writes the elements of x into y:
//
//	y[i] = x[i] for all i.
//
// Copy panics if x and y do not have the same length.
func Copy(x, y Vector) {
	n := x.N
	if n != y.N {
		panic(badLength)
	}
	blas64.Dcopy(n, x.Data, x.Inc, y.Data, y.Inc)
}
// Axpy accumulates alpha times x into y:
//
//	y[i] += alpha*x[i] for all i.
//
// Axpy panics if x and y do not have the same length.
func Axpy(alpha float64, x, y Vector) {
	n := x.N
	if n != y.N {
		panic(badLength)
	}
	blas64.Daxpy(n, alpha, x.Data, x.Inc, y.Data, y.Inc)
}
// Rotg computes the parameters of a Givens plane rotation so that
//
//	⎡ c s⎤   ⎡a⎤   ⎡r⎤
//	⎣-s c⎦ * ⎣b⎦ = ⎣0⎦
//
// where a and b are the Cartesian coordinates of a given point.
// c, s, and r are defined as
//
//	r = ±Sqrt(a^2 + b^2),
//	c = a/r, the cosine of the rotation angle,
//	s = b/r, the sine of the rotation angle,
//
// and z is defined such that
//
//	if |a| > |b|,        z = s,
//	otherwise if c != 0, z = 1/c,
//	otherwise            z = 1.
func Rotg(a, b float64) (c, s, r, z float64) {
	c, s, r, z = blas64.Drotg(a, b)
	return c, s, r, z
}
// Rotmg computes the parameters of the modified Givens rotation by
// delegating to Drotmg of the current implementation. See
// http://www.netlib.org/lapack/explore-html/df/deb/drotmg_8f.html
// for more details.
func Rotmg(d1, d2, b1, b2 float64) (p blas.DrotmParams, rd1, rd2, rb1 float64) {
	p, rd1, rd2, rb1 = blas64.Drotmg(d1, d2, b1, b2)
	return p, rd1, rd2, rb1
}
// Rot applies a plane transformation to the n points whose coordinates are
// held in the vectors x and y:
//
//	x[i] =  c*x[i] + s*y[i],
//	y[i] = -s*x[i] + c*y[i], for all i.
//
// Rot panics if x and y do not have the same length.
func Rot(x, y Vector, c, s float64) {
	n := x.N
	if n != y.N {
		panic(badLength)
	}
	blas64.Drot(n, x.Data, x.Inc, y.Data, y.Inc, c, s)
}
// Rotm applies the modified Givens rotation described by p to the n points
// whose coordinates are held in the vectors x and y.
//
// Rotm panics if x and y do not have the same length.
func Rotm(x, y Vector, p blas.DrotmParams) {
	n := x.N
	if n != y.N {
		panic(badLength)
	}
	blas64.Drotm(n, x.Data, x.Inc, y.Data, y.Inc, p)
}
// Scal multiplies every element of x by alpha:
//
//	x[i] *= alpha for all i.
//
// Scal panics if the increment of x is negative.
func Scal(alpha float64, x Vector) {
	if inc := x.Inc; inc >= 0 {
		blas64.Dscal(x.N, alpha, x.Data, inc)
		return
	}
	panic(negInc)
}
// Level 2
// Gemv performs the matrix-vector operation
//
//	y = alpha * A * x + beta * y   if t == blas.NoTrans,
//	y = alpha * Aᵀ * x + beta * y  if t == blas.Trans or blas.ConjTrans,
//
// where A is an m×n dense matrix, x and y are vectors, and alpha and beta
// are scalars.
func Gemv(t blas.Transpose, alpha float64, a General, x Vector, beta float64, y Vector) {
	m, n := a.Rows, a.Cols
	blas64.Dgemv(t, m, n, alpha, a.Data, a.Stride, x.Data, x.Inc, beta, y.Data, y.Inc)
}
// Gbmv performs the band matrix-vector operation
//
//	y = alpha * A * x + beta * y   if t == blas.NoTrans,
//	y = alpha * Aᵀ * x + beta * y  if t == blas.Trans or blas.ConjTrans,
//
// where A is an m×n band matrix, x and y are vectors, and alpha and beta
// are scalars.
func Gbmv(t blas.Transpose, alpha float64, a Band, x Vector, beta float64, y Vector) {
	m, n := a.Rows, a.Cols
	kl, ku := a.KL, a.KU
	blas64.Dgbmv(t, m, n, kl, ku, alpha, a.Data, a.Stride, x.Data, x.Inc, beta, y.Data, y.Inc)
}
// Trmv performs the in-place triangular matrix-vector product
//
//	x = A * x   if t == blas.NoTrans,
//	x = Aᵀ * x  if t == blas.Trans or blas.ConjTrans,
//
// where A is an n×n triangular matrix, and x is a vector.
func Trmv(t blas.Transpose, a Triangular, x Vector) {
	uplo, diag := a.Uplo, a.Diag
	blas64.Dtrmv(uplo, t, diag, a.N, a.Data, a.Stride, x.Data, x.Inc)
}
// Tbmv performs the in-place triangular band matrix-vector product
//
//	x = A * x   if t == blas.NoTrans,
//	x = Aᵀ * x  if t == blas.Trans or blas.ConjTrans,
//
// where A is an n×n triangular band matrix, and x is a vector.
func Tbmv(t blas.Transpose, a TriangularBand, x Vector) {
	uplo, diag := a.Uplo, a.Diag
	blas64.Dtbmv(uplo, t, diag, a.N, a.K, a.Data, a.Stride, x.Data, x.Inc)
}
// Tpmv performs the in-place packed triangular matrix-vector product
//
//	x = A * x   if t == blas.NoTrans,
//	x = Aᵀ * x  if t == blas.Trans or blas.ConjTrans,
//
// where A is an n×n triangular matrix in packed format, and x is a vector.
func Tpmv(t blas.Transpose, a TriangularPacked, x Vector) {
	uplo, diag := a.Uplo, a.Diag
	blas64.Dtpmv(uplo, t, diag, a.N, a.Data, x.Data, x.Inc)
}
// Trsv solves the triangular system
//
//	A * x = b   if t == blas.NoTrans,
//	Aᵀ * x = b  if t == blas.Trans or blas.ConjTrans,
//
// where A is an n×n triangular matrix, and x and b are vectors.
//
// On entry x holds the values of b; on return it holds the solution, which
// is written in place.
//
// The routine does not test for singularity or near-singularity. Such
// tests must be performed before calling it.
func Trsv(t blas.Transpose, a Triangular, x Vector) {
	uplo, diag := a.Uplo, a.Diag
	blas64.Dtrsv(uplo, t, diag, a.N, a.Data, a.Stride, x.Data, x.Inc)
}
// Tbsv solves the triangular band system
//
//	A * x = b   if t == blas.NoTrans,
//	Aᵀ * x = b  if t == blas.Trans or blas.ConjTrans,
//
// where A is an n×n triangular band matrix, and x and b are vectors.
//
// On entry x holds the values of b; on return it holds the solution, which
// is written in place.
//
// The routine does not test for singularity or near-singularity. Such
// tests must be performed before calling it.
func Tbsv(t blas.Transpose, a TriangularBand, x Vector) {
	uplo, diag := a.Uplo, a.Diag
	blas64.Dtbsv(uplo, t, diag, a.N, a.K, a.Data, a.Stride, x.Data, x.Inc)
}
// Tpsv solves the packed triangular system
//
//	A * x = b   if t == blas.NoTrans,
//	Aᵀ * x = b  if t == blas.Trans or blas.ConjTrans,
//
// where A is an n×n triangular matrix in packed format, and x and b are
// vectors.
//
// On entry x holds the values of b; on return it holds the solution, which
// is written in place.
//
// The routine does not test for singularity or near-singularity. Such
// tests must be performed before calling it.
func Tpsv(t blas.Transpose, a TriangularPacked, x Vector) {
	uplo, diag := a.Uplo, a.Diag
	blas64.Dtpsv(uplo, t, diag, a.N, a.Data, x.Data, x.Inc)
}
// Symv performs the symmetric matrix-vector operation
//
//	y = alpha * A * x + beta * y,
//
// where A is an n×n symmetric matrix, x and y are vectors, and alpha and
// beta are scalars.
func Symv(alpha float64, a Symmetric, x Vector, beta float64, y Vector) {
	uplo, n := a.Uplo, a.N
	blas64.Dsymv(uplo, n, alpha, a.Data, a.Stride, x.Data, x.Inc, beta, y.Data, y.Inc)
}
// Sbmv performs the symmetric band matrix-vector operation
//
//	y = alpha * A * x + beta * y,
//
// where A is an n×n symmetric band matrix, x and y are vectors, and alpha
// and beta are scalars.
func Sbmv(alpha float64, a SymmetricBand, x Vector, beta float64, y Vector) {
	uplo, n, k := a.Uplo, a.N, a.K
	blas64.Dsbmv(uplo, n, k, alpha, a.Data, a.Stride, x.Data, x.Inc, beta, y.Data, y.Inc)
}
// Spmv performs the packed symmetric matrix-vector operation
//
//	y = alpha * A * x + beta * y,
//
// where A is an n×n symmetric matrix in packed format, x and y are vectors,
// and alpha and beta are scalars.
func Spmv(alpha float64, a SymmetricPacked, x Vector, beta float64, y Vector) {
	uplo, n := a.Uplo, a.N
	blas64.Dspmv(uplo, n, alpha, a.Data, x.Data, x.Inc, beta, y.Data, y.Inc)
}
// Ger applies the rank-1 update
//
//	A += alpha * x * yᵀ,
//
// where A is an m×n dense matrix, x and y are vectors, and alpha is a scalar.
func Ger(alpha float64, x, y Vector, a General) {
	m, n := a.Rows, a.Cols
	blas64.Dger(m, n, alpha, x.Data, x.Inc, y.Data, y.Inc, a.Data, a.Stride)
}
// Syr applies the symmetric rank-1 update
//
//	A += alpha * x * xᵀ,
//
// where A is an n×n symmetric matrix, x is a vector, and alpha is a scalar.
func Syr(alpha float64, x Vector, a Symmetric) {
	uplo, n := a.Uplo, a.N
	blas64.Dsyr(uplo, n, alpha, x.Data, x.Inc, a.Data, a.Stride)
}
// Spr applies the packed symmetric rank-1 update
//
//	A += alpha * x * xᵀ,
//
// where A is an n×n symmetric matrix in packed format, x is a vector, and
// alpha is a scalar.
func Spr(alpha float64, x Vector, a SymmetricPacked) {
	uplo, n := a.Uplo, a.N
	blas64.Dspr(uplo, n, alpha, x.Data, x.Inc, a.Data)
}
// Syr2 applies the symmetric rank-2 update
//
//	A += alpha * x * yᵀ + alpha * y * xᵀ,
//
// where A is a symmetric n×n matrix, x and y are vectors, and alpha is a
// scalar.
func Syr2(alpha float64, x, y Vector, a Symmetric) {
	uplo, n := a.Uplo, a.N
	blas64.Dsyr2(uplo, n, alpha, x.Data, x.Inc, y.Data, y.Inc, a.Data, a.Stride)
}
// Spr2 applies the packed symmetric rank-2 update
//
//	A += alpha * x * yᵀ + alpha * y * xᵀ,
//
// where A is an n×n symmetric matrix in packed format, x and y are vectors,
// and alpha is a scalar.
func Spr2(alpha float64, x, y Vector, a SymmetricPacked) {
	uplo, n := a.Uplo, a.N
	blas64.Dspr2(uplo, n, alpha, x.Data, x.Inc, y.Data, y.Inc, a.Data)
}
// Level 3
// Gemm performs the matrix-matrix operation
//
//	C = alpha * A * B + beta * C,
//
// where A, B, and C are dense matrices, and alpha and beta are scalars.
// tA and tB specify whether A or B are transposed.
//
// The dimensions m, n and k passed to Dgemm are derived from the shapes of
// a and b after applying tA and tB; the shape of c is not consulted.
func Gemm(tA, tB blas.Transpose, alpha float64, a, b General, beta float64, c General) {
	// Shape of op(A) is m×k: swap the stored dimensions when A is transposed.
	m, k := a.Rows, a.Cols
	if tA != blas.NoTrans {
		m, k = k, m
	}
	// op(B) contributes the column count n.
	n := b.Cols
	if tB != blas.NoTrans {
		n = b.Rows
	}
	blas64.Dgemm(tA, tB, m, n, k, alpha, a.Data, a.Stride, b.Data, b.Stride, beta, c.Data, c.Stride)
}
// Symm performs the symmetric matrix-matrix operation
//
//	C = alpha * A * B + beta * C  if s == blas.Left,
//	C = alpha * B * A + beta * C  if s == blas.Right,
//
// where A is an n×n or m×m symmetric matrix, B and C are m×n matrices, and
// alpha and beta are scalars.
func Symm(s blas.Side, alpha float64, a Symmetric, b General, beta float64, c General) {
	// Default to the right-hand side case, where A supplies n.
	m, n := b.Rows, a.N
	if s == blas.Left {
		// A multiplies from the left, so it supplies m instead.
		m, n = a.N, b.Cols
	}
	blas64.Dsymm(s, a.Uplo, m, n, alpha, a.Data, a.Stride, b.Data, b.Stride, beta, c.Data, c.Stride)
}
// Syrk applies the symmetric rank-k update
//
//	C = alpha * A * Aᵀ + beta * C  if t == blas.NoTrans,
//	C = alpha * Aᵀ * A + beta * C  if t == blas.Trans or blas.ConjTrans,
//
// where C is an n×n symmetric matrix, A is an n×k matrix if t == blas.NoTrans
// and a k×n matrix otherwise, and alpha and beta are scalars.
func Syrk(t blas.Transpose, alpha float64, a General, beta float64, c Symmetric) {
	// Transposed case first; n and k trade places when A is used as stored.
	n, k := a.Cols, a.Rows
	if t == blas.NoTrans {
		n, k = a.Rows, a.Cols
	}
	blas64.Dsyrk(c.Uplo, t, n, k, alpha, a.Data, a.Stride, beta, c.Data, c.Stride)
}
// Syr2k applies the symmetric rank-2k update
//
//	C = alpha * A * Bᵀ + alpha * B * Aᵀ + beta * C  if t == blas.NoTrans,
//	C = alpha * Aᵀ * B + alpha * Bᵀ * A + beta * C  if t == blas.Trans or blas.ConjTrans,
//
// where C is an n×n symmetric matrix, A and B are n×k matrices if
// t == NoTrans and k×n matrices otherwise, and alpha and beta are scalars.
//
// n and k are taken from the shape of a only; the shape of b is not
// consulted.
func Syr2k(t blas.Transpose, alpha float64, a, b General, beta float64, c Symmetric) {
	n, k := a.Cols, a.Rows
	if t == blas.NoTrans {
		n, k = a.Rows, a.Cols
	}
	blas64.Dsyr2k(c.Uplo, t, n, k, alpha, a.Data, a.Stride, b.Data, b.Stride, beta, c.Data, c.Stride)
}
// Trmm performs the in-place triangular matrix-matrix product
//
//	B = alpha * A * B   if tA == blas.NoTrans and s == blas.Left,
//	B = alpha * Aᵀ * B  if tA == blas.Trans or blas.ConjTrans, and s == blas.Left,
//	B = alpha * B * A   if tA == blas.NoTrans and s == blas.Right,
//	B = alpha * B * Aᵀ  if tA == blas.Trans or blas.ConjTrans, and s == blas.Right,
//
// where A is an n×n or m×m triangular matrix, B is an m×n matrix, and alpha
// is a scalar.
func Trmm(s blas.Side, tA blas.Transpose, alpha float64, a Triangular, b General) {
	m, n := b.Rows, b.Cols
	blas64.Dtrmm(s, a.Uplo, tA, a.Diag, m, n, alpha, a.Data, a.Stride, b.Data, b.Stride)
}
// Trsm solves the triangular matrix equation
//
//	A * X = alpha * B   if tA == blas.NoTrans and s == blas.Left,
//	Aᵀ * X = alpha * B  if tA == blas.Trans or blas.ConjTrans, and s == blas.Left,
//	X * A = alpha * B   if tA == blas.NoTrans and s == blas.Right,
//	X * Aᵀ = alpha * B  if tA == blas.Trans or blas.ConjTrans, and s == blas.Right,
//
// where A is an n×n or m×m triangular matrix, X and B are m×n matrices, and
// alpha is a scalar.
//
// On entry b holds the values of B; on return it holds the solution X,
// which is written in place.
//
// No check is made that A is invertible.
func Trsm(s blas.Side, tA blas.Transpose, alpha float64, a Triangular, b General) {
	m, n := b.Rows, b.Cols
	blas64.Dtrsm(s, a.Uplo, tA, a.Diag, m, n, alpha, a.Data, a.Stride, b.Data, b.Stride)
}

277
vendor/gonum.org/v1/gonum/blas/blas64/conv.go generated vendored Normal file
View File

@@ -0,0 +1,277 @@
// Copyright ©2015 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package blas64
import "gonum.org/v1/gonum/blas"
// GeneralCols represents a matrix using the conventional column-major storage scheme.
//
// It shares the field set of General; the conversion routines read the
// element at row i and column j from Data[i+j*Stride].
type GeneralCols General
// From copies every element of the row-major matrix a into the
// column-major receiver. The receiver must have the same dimensions as a
// and have adequate backing data storage.
//
// From panics if the dimensions differ or if the receiver's Data is too
// short for its own dimensions and stride.
func (t GeneralCols) From(a General) {
	rows, cols := a.Rows, a.Cols
	if t.Rows != rows || t.Cols != cols {
		panic("blas64: mismatched dimension")
	}
	if need := (t.Cols-1)*t.Stride + t.Rows; len(t.Data) < need {
		panic("blas64: short data slice")
	}
	for i := 0; i < rows; i++ {
		// Row i of the source is contiguous; scatter it down column-major
		// storage, one destination column per element.
		start := i * a.Stride
		for j, v := range a.Data[start : start+cols] {
			t.Data[i+j*t.Stride] = v
		}
	}
}
// From copies every element of the column-major matrix a into the
// row-major receiver. The receiver must have the same dimensions as a and
// have adequate backing data storage.
//
// From panics if the dimensions differ or if the receiver's Data is too
// short for its own dimensions and stride.
func (t General) From(a GeneralCols) {
	rows, cols := a.Rows, a.Cols
	if t.Rows != rows || t.Cols != cols {
		panic("blas64: mismatched dimension")
	}
	if need := (t.Rows-1)*t.Stride + t.Cols; len(t.Data) < need {
		panic("blas64: short data slice")
	}
	for j := 0; j < cols; j++ {
		// Column j of the source is contiguous; scatter it across row-major
		// storage, one destination row per element.
		start := j * a.Stride
		for i, v := range a.Data[start : start+rows] {
			t.Data[i*t.Stride+j] = v
		}
	}
}
// TriangularCols represents a triangular matrix using the conventional
// column-major storage scheme.
//
// It shares the field set of Triangular; the conversion routines read the
// element at row i and column j from Data[i+j*Stride].
type TriangularCols Triangular
// From copies the stored triangle of the row-major matrix a into the
// column-major receiver. The receiver must have the same dimensions, uplo
// and diag as a and have adequate backing data storage.
//
// Only the elements selected by a.Uplo are copied: the upper triangle for
// blas.Upper, the lower triangle for blas.Lower and the full square for
// blas.All. From panics on any mismatch and on any other uplo value.
func (t TriangularCols) From(a Triangular) {
	if t.N != a.N {
		panic("blas64: mismatched dimension")
	}
	if t.Uplo != a.Uplo {
		panic("blas64: mismatched BLAS uplo")
	}
	if t.Diag != a.Diag {
		panic("blas64: mismatched BLAS diag")
	}
	// Reject unknown uplo values up front so that the panic also fires for
	// an empty matrix.
	switch a.Uplo {
	case blas.Upper, blas.Lower, blas.All:
	default:
		panic("blas64: bad BLAS uplo")
	}
	n := a.N
	for i := 0; i < n; i++ {
		// [lo, hi) is the range of columns stored in row i.
		lo, hi := 0, n
		switch a.Uplo {
		case blas.Upper:
			lo = i
		case blas.Lower:
			hi = i + 1
		}
		for j := lo; j < hi; j++ {
			t.Data[i+j*t.Stride] = a.Data[i*a.Stride+j]
		}
	}
}
// From copies the stored triangle of the column-major matrix a into the
// row-major receiver. The receiver must have the same dimensions, uplo and
// diag as a and have adequate backing data storage.
//
// Only the elements selected by a.Uplo are copied: the upper triangle for
// blas.Upper, the lower triangle for blas.Lower and the full square for
// blas.All. From panics on any mismatch and on any other uplo value.
func (t Triangular) From(a TriangularCols) {
	if t.N != a.N {
		panic("blas64: mismatched dimension")
	}
	if t.Uplo != a.Uplo {
		panic("blas64: mismatched BLAS uplo")
	}
	if t.Diag != a.Diag {
		panic("blas64: mismatched BLAS diag")
	}
	// Reject unknown uplo values up front so that the panic also fires for
	// an empty matrix.
	switch a.Uplo {
	case blas.Upper, blas.Lower, blas.All:
	default:
		panic("blas64: bad BLAS uplo")
	}
	n := a.N
	for i := 0; i < n; i++ {
		// [lo, hi) is the range of columns stored in row i.
		lo, hi := 0, n
		switch a.Uplo {
		case blas.Upper:
			lo = i
		case blas.Lower:
			hi = i + 1
		}
		for j := lo; j < hi; j++ {
			t.Data[i*t.Stride+j] = a.Data[i+j*a.Stride]
		}
	}
}
// BandCols represents a matrix using the band column-major storage scheme.
//
// It shares the field set of Band; the conversion routines read the element
// at row i and column j from Data[i+KU-j+j*Stride].
type BandCols Band
// From copies the band of the row-major band matrix a into the
// column-major receiver. The receiver must have the same dimensions and
// bandwidth as a and have adequate backing data storage.
//
// From panics if the dimensions or bandwidths differ, or if either stride
// is smaller than KL+KU+1.
func (t BandCols) From(a Band) {
	switch {
	case t.Rows != a.Rows || t.Cols != a.Cols:
		panic("blas64: mismatched dimension")
	case t.KL != a.KL || t.KU != a.KU:
		panic("blas64: mismatched bandwidth")
	case a.Stride < a.KL+a.KU+1:
		panic("blas64: short stride for source")
	case t.Stride < t.KL+t.KU+1:
		panic("blas64: short stride for destination")
	}
	for i := 0; i < a.Rows; i++ {
		// Columns [first, end) are the ones inside the band on row i.
		first := max(0, i-a.KL)
		end := min(i+a.KU+1, a.Cols)
		for j := first; j < end; j++ {
			t.Data[i+t.KU-j+j*t.Stride] = a.Data[j+a.KL-i+i*a.Stride]
		}
	}
}
// From copies the band of the column-major band matrix a into the
// row-major receiver. The receiver must have the same dimensions and
// bandwidth as a and have adequate backing data storage.
//
// From panics if the dimensions or bandwidths differ, or if either stride
// is smaller than KL+KU+1. The two strides are allowed to differ.
func (t Band) From(a BandCols) {
	if t.Rows != a.Rows || t.Cols != a.Cols {
		panic("blas64: mismatched dimension")
	}
	if t.KL != a.KL || t.KU != a.KU {
		panic("blas64: mismatched bandwidth")
	}
	if a.Stride < a.KL+a.KU+1 {
		panic("blas64: short stride for source")
	}
	if t.Stride < t.KL+t.KU+1 {
		panic("blas64: short stride for destination")
	}
	for j := 0; j < a.Cols; j++ {
		for i := max(0, j-a.KU); i < min(j+a.KL+1, a.Rows); i++ {
			// Element (i, j) sits at band offset j+KL-i of row i in the
			// row-major destination and at band offset i+KU-j of column j in
			// the column-major source. Each side must be addressed with its
			// own stride; mixing them up corrupts the copy whenever
			// t.Stride != a.Stride.
			t.Data[j+t.KL-i+i*t.Stride] = a.Data[i+a.KU-j+j*a.Stride]
		}
	}
}
// TriangularBandCols represents a triangular matrix using the band column-major storage scheme.
//
// It shares the field set of TriangularBand.
type TriangularBandCols TriangularBand
// From copies the band of the row-major triangular band matrix a into the
// column-major receiver. The receiver must have the same dimensions,
// bandwidth and uplo as a and have adequate backing data storage.
//
// The copy is delegated to BandCols.From, treating K as the number of
// super-diagonals for blas.Upper and of sub-diagonals for blas.Lower. From
// panics on any mismatch, on a stride smaller than K+1 and on any other
// uplo value.
func (t TriangularBandCols) From(a TriangularBand) {
	switch {
	case t.N != a.N:
		panic("blas64: mismatched dimension")
	case t.K != a.K:
		panic("blas64: mismatched bandwidth")
	case a.Stride < a.K+1:
		panic("blas64: short stride for source")
	case t.Stride < t.K+1:
		panic("blas64: short stride for destination")
	case t.Uplo != a.Uplo:
		panic("blas64: mismatched BLAS uplo")
	case t.Diag != a.Diag:
		panic("blas64: mismatched BLAS diag")
	}
	// View both operands as general band matrices sharing the same storage.
	dst := BandCols{Rows: t.N, Cols: t.N, Stride: t.Stride, Data: t.Data}
	src := Band{Rows: a.N, Cols: a.N, Stride: a.Stride, Data: a.Data}
	switch a.Uplo {
	case blas.Upper:
		dst.KU, src.KU = t.K, a.K
	case blas.Lower:
		dst.KL, src.KL = t.K, a.K
	default:
		panic("blas64: bad BLAS uplo")
	}
	dst.From(src)
}
// From copies the band of the column-major triangular band matrix a into
// the row-major receiver. The receiver must have the same dimensions,
// bandwidth and uplo as a and have adequate backing data storage.
//
// The copy is delegated to Band.From, treating K as the number of
// super-diagonals for blas.Upper and of sub-diagonals for blas.Lower. From
// panics on any mismatch, on a stride smaller than K+1 and on any other
// uplo value.
func (t TriangularBand) From(a TriangularBandCols) {
	switch {
	case t.N != a.N:
		panic("blas64: mismatched dimension")
	case t.K != a.K:
		panic("blas64: mismatched bandwidth")
	case a.Stride < a.K+1:
		panic("blas64: short stride for source")
	case t.Stride < t.K+1:
		panic("blas64: short stride for destination")
	case t.Uplo != a.Uplo:
		panic("blas64: mismatched BLAS uplo")
	case t.Diag != a.Diag:
		panic("blas64: mismatched BLAS diag")
	}
	// View both operands as general band matrices sharing the same storage.
	dst := Band{Rows: t.N, Cols: t.N, Stride: t.Stride, Data: t.Data}
	src := BandCols{Rows: a.N, Cols: a.N, Stride: a.Stride, Data: a.Data}
	switch a.Uplo {
	case blas.Upper:
		dst.KU, src.KU = t.K, a.K
	case blas.Lower:
		dst.KL, src.KL = t.K, a.K
	default:
		panic("blas64: bad BLAS uplo")
	}
	dst.From(src)
}
// min returns the smaller of a and b.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}
// max returns the larger of a and b.
func max(a, b int) int {
	if b > a {
		return b
	}
	return a
}

153
vendor/gonum.org/v1/gonum/blas/blas64/conv_symmetric.go generated vendored Normal file
View File

@@ -0,0 +1,153 @@
// Copyright ©2015 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package blas64
import "gonum.org/v1/gonum/blas"
// SymmetricCols represents a symmetric matrix using the conventional
// column-major storage scheme.
//
// It shares the field set of Symmetric; the conversion routines read the
// element at row i and column j from Data[i+j*Stride].
type SymmetricCols Symmetric
// From copies the stored triangle of the row-major symmetric matrix a into
// the column-major receiver. The receiver must have the same dimensions and
// uplo as a and have adequate backing data storage.
//
// Only the triangle selected by a.Uplo is copied. From panics on any
// mismatch and when uplo is neither blas.Upper nor blas.Lower.
func (t SymmetricCols) From(a Symmetric) {
	if t.N != a.N {
		panic("blas64: mismatched dimension")
	}
	if t.Uplo != a.Uplo {
		panic("blas64: mismatched BLAS uplo")
	}
	// Reject unknown uplo values up front so that the panic also fires for
	// an empty matrix.
	upper := a.Uplo == blas.Upper
	if !upper && a.Uplo != blas.Lower {
		panic("blas64: bad BLAS uplo")
	}
	n := a.N
	for i := 0; i < n; i++ {
		// [lo, hi) is the range of columns stored in row i.
		lo, hi := 0, i+1
		if upper {
			lo, hi = i, n
		}
		for j := lo; j < hi; j++ {
			t.Data[i+j*t.Stride] = a.Data[i*a.Stride+j]
		}
	}
}
// From copies the stored triangle of the column-major symmetric matrix a
// into the row-major receiver. The receiver must have the same dimensions
// and uplo as a and have adequate backing data storage.
//
// Only the triangle selected by a.Uplo is copied. From panics on any
// mismatch and when uplo is neither blas.Upper nor blas.Lower.
func (t Symmetric) From(a SymmetricCols) {
	if t.N != a.N {
		panic("blas64: mismatched dimension")
	}
	if t.Uplo != a.Uplo {
		panic("blas64: mismatched BLAS uplo")
	}
	// Reject unknown uplo values up front so that the panic also fires for
	// an empty matrix.
	upper := a.Uplo == blas.Upper
	if !upper && a.Uplo != blas.Lower {
		panic("blas64: bad BLAS uplo")
	}
	n := a.N
	for i := 0; i < n; i++ {
		// [lo, hi) is the range of columns stored in row i.
		lo, hi := 0, i+1
		if upper {
			lo, hi = i, n
		}
		for j := lo; j < hi; j++ {
			t.Data[i*t.Stride+j] = a.Data[i+j*a.Stride]
		}
	}
}
// SymmetricBandCols represents a symmetric matrix using the band column-major storage scheme.
//
// It shares the field set of SymmetricBand.
type SymmetricBandCols SymmetricBand
// From copies the band of the row-major symmetric band matrix a into the
// column-major receiver. The receiver must have the same dimensions,
// bandwidth and uplo as a and have adequate backing data storage.
//
// The copy is delegated to BandCols.From, treating K as the number of
// super-diagonals for blas.Upper and of sub-diagonals for blas.Lower. From
// panics on any mismatch, on a stride smaller than K+1 and on any other
// uplo value.
func (t SymmetricBandCols) From(a SymmetricBand) {
	switch {
	case t.N != a.N:
		panic("blas64: mismatched dimension")
	case t.K != a.K:
		panic("blas64: mismatched bandwidth")
	case a.Stride < a.K+1:
		panic("blas64: short stride for source")
	case t.Stride < t.K+1:
		panic("blas64: short stride for destination")
	case t.Uplo != a.Uplo:
		panic("blas64: mismatched BLAS uplo")
	}
	// View both operands as general band matrices sharing the same storage.
	dst := BandCols{Rows: t.N, Cols: t.N, Stride: t.Stride, Data: t.Data}
	src := Band{Rows: a.N, Cols: a.N, Stride: a.Stride, Data: a.Data}
	switch a.Uplo {
	case blas.Upper:
		dst.KU, src.KU = t.K, a.K
	case blas.Lower:
		dst.KL, src.KL = t.K, a.K
	default:
		panic("blas64: bad BLAS uplo")
	}
	dst.From(src)
}
// From copies the band of the column-major symmetric band matrix a into
// the row-major receiver. The receiver must have the same dimensions,
// bandwidth and uplo as a and have adequate backing data storage.
//
// The copy is delegated to Band.From, treating K as the number of
// super-diagonals for blas.Upper and of sub-diagonals for blas.Lower. From
// panics on any mismatch, on a stride smaller than K+1 and on any other
// uplo value.
func (t SymmetricBand) From(a SymmetricBandCols) {
	switch {
	case t.N != a.N:
		panic("blas64: mismatched dimension")
	case t.K != a.K:
		panic("blas64: mismatched bandwidth")
	case a.Stride < a.K+1:
		panic("blas64: short stride for source")
	case t.Stride < t.K+1:
		panic("blas64: short stride for destination")
	case t.Uplo != a.Uplo:
		panic("blas64: mismatched BLAS uplo")
	}
	// View both operands as general band matrices sharing the same storage.
	dst := Band{Rows: t.N, Cols: t.N, Stride: t.Stride, Data: t.Data}
	src := BandCols{Rows: a.N, Cols: a.N, Stride: a.Stride, Data: a.Data}
	switch a.Uplo {
	case blas.Upper:
		dst.KU, src.KU = t.K, a.K
	case blas.Lower:
		dst.KL, src.KL = t.K, a.K
	default:
		panic("blas64: bad BLAS uplo")
	}
	dst.From(src)
}

6
vendor/gonum.org/v1/gonum/blas/blas64/doc.go generated vendored Normal file
View File

@@ -0,0 +1,6 @@
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package blas64 provides a simple interface to the float64 BLAS API.
package blas64 // import "gonum.org/v1/gonum/blas/blas64"

600
vendor/gonum.org/v1/gonum/blas/cblas128/cblas128.go generated vendored Normal file
View File

@@ -0,0 +1,600 @@
// Copyright ©2015 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cblas128
import (
"gonum.org/v1/gonum/blas"
"gonum.org/v1/gonum/blas/gonum"
)
// cblas128 is the package-level BLAS complex128 implementation that the
// wrappers in this package dispatch to. It is reassigned by Use.
var cblas128 blas.Complex128 = gonum.Implementation{}
// Use sets the BLAS complex128 implementation to be used by subsequent BLAS calls.
// The default implementation is
// gonum.org/v1/gonum/blas/gonum.Implementation.
//
// NOTE(review): the implementation is replaced with a plain assignment and no
// synchronization is visible here, so calling Use concurrently with other
// functions of this package is presumably unsafe.
func Use(b blas.Complex128) {
cblas128 = b
}
// Implementation returns the current BLAS complex128 implementation, that is
// the default one or the value most recently passed to Use.
//
// Implementation allows direct calls to the current BLAS complex128 implementation
// giving finer control of parameters.
func Implementation() blas.Complex128 {
return cblas128
}
// Vector represents a vector with an associated element increment.
//
// N is the number of elements, Inc is the element increment and Data is the
// backing storage. All three are handed unchanged to the BLAS routines.
type Vector struct {
N int
Inc int
Data []complex128
}
// General represents a matrix using the conventional storage scheme.
//
// Rows, Cols, Data and Stride are handed unchanged to the BLAS routines.
// NOTE(review): presumably row-major with Stride elements between rows, as
// in package blas64 — confirm against this package's conversion routines.
type General struct {
Rows, Cols int
Stride int
Data []complex128
}
// Band represents a band matrix using the band storage scheme.
//
// KL and KU are the bandwidth parameters passed to the BLAS routines as kl
// and ku (see Gbmv).
type Band struct {
Rows, Cols int
KL, KU int
Stride int
Data []complex128
}
// Triangular represents a triangular matrix using the conventional storage scheme.
//
// N is the order of the matrix. Uplo and Diag are forwarded to the BLAS
// routines as their uplo and diag arguments (see Trmv and Trsv).
type Triangular struct {
N int
Stride int
Data []complex128
Uplo blas.Uplo
Diag blas.Diag
}
// TriangularBand represents a triangular matrix using the band storage scheme.
//
// N is the order of the matrix and K is the bandwidth passed to the BLAS
// routines as k (see Tbmv and Tbsv).
type TriangularBand struct {
N, K int
Stride int
Data []complex128
Uplo blas.Uplo
Diag blas.Diag
}
// TriangularPacked represents a triangular matrix using the packed storage scheme.
//
// There is no Stride field: N, Data, Uplo and Diag are all that is passed to
// the packed BLAS routines (see Tpmv and Tpsv).
type TriangularPacked struct {
N int
Data []complex128
Uplo blas.Uplo
Diag blas.Diag
}
// Symmetric represents a symmetric matrix using the conventional storage scheme.
//
// N is the order of the matrix. Hermitian is declared with the same
// underlying type.
type Symmetric struct {
N int
Stride int
Data []complex128
Uplo blas.Uplo
}
// SymmetricBand represents a symmetric matrix using the band storage scheme.
//
// N is the order of the matrix and K its bandwidth. HermitianBand is declared
// with the same underlying type.
type SymmetricBand struct {
N, K int
Stride int
Data []complex128
Uplo blas.Uplo
}
// SymmetricPacked represents a symmetric matrix using the packed storage scheme.
//
// There is no Stride field. HermitianPacked is declared with the same
// underlying type.
type SymmetricPacked struct {
N int
Data []complex128
Uplo blas.Uplo
}
// Hermitian represents an Hermitian matrix using the conventional storage scheme.
//
// It has the same fields as Symmetric and is the matrix type accepted by
// Hemv.
type Hermitian Symmetric
// HermitianBand represents an Hermitian matrix using the band storage scheme.
//
// It has the same fields as SymmetricBand and is the matrix type accepted by
// Hbmv.
type HermitianBand SymmetricBand
// HermitianPacked represents an Hermitian matrix using the packed storage scheme.
//
// It has the same fields as SymmetricPacked and is the matrix type accepted
// by Hpmv.
type HermitianPacked SymmetricPacked
// Level 1
// Panic messages used by the Level 1 wrappers when validating their vector
// arguments.
const (
// negInc is the panic value for a negative Vector.Inc.
negInc = "cblas128: negative vector increment"
// badLength is the panic value for two vectors whose N fields differ.
badLength = "cblas128: vector length mismatch"
)
// Dotu returns the dot product of x and y without complex conjugation:
//
//	xᵀ * y.
//
// Dotu panics if x and y do not have the same length.
func Dotu(x, y Vector) complex128 {
	n := x.N
	if n != y.N {
		panic(badLength)
	}
	return cblas128.Zdotu(n, x.Data, x.Inc, y.Data, y.Inc)
}
// Dotc returns the dot product of x and y with complex conjugation of x:
//
//	xᴴ * y.
//
// Dotc panics if x and y do not have the same length.
func Dotc(x, y Vector) complex128 {
	n := x.N
	if n != y.N {
		panic(badLength)
	}
	return cblas128.Zdotc(n, x.Data, x.Inc, y.Data, y.Inc)
}
// Nrm2 returns the Euclidean norm of the complex vector x:
//
//	sqrt(\sum_i |x[i]|^2).
//
// Nrm2 panics if the increment of x is negative.
func Nrm2(x Vector) float64 {
	if inc := x.Inc; inc >= 0 {
		return cblas128.Dznrm2(x.N, x.Data, inc)
	}
	panic(negInc)
}
// Asum returns the sum of the magnitudes of the real and imaginary parts of
// the elements of x:
//
//	\sum_i (|Re x[i]| + |Im x[i]|).
//
// Asum panics if the increment of x is negative.
func Asum(x Vector) float64 {
	if inc := x.Inc; inc >= 0 {
		return cblas128.Dzasum(x.N, x.Data, inc)
	}
	panic(negInc)
}
// Iamax returns the index of an element of x for which the sum of the
// magnitudes of the real and imaginary parts (|Re x[i]|+|Im x[i]|) is
// largest. When several elements share that value, the earliest index wins.
//
// Iamax returns -1 if n == 0.
//
// Iamax panics if the increment of x is negative.
func Iamax(x Vector) int {
	if x.Inc < 0 {
		panic(negInc)
	}
	idx := cblas128.Izamax(x.N, x.Data, x.Inc)
	return idx
}
// Swap exchanges the contents of x and y element by element:
//
//	x[i], y[i] = y[i], x[i] for all i.
//
// Swap panics if x and y do not have the same length.
func Swap(x, y Vector) {
	n := x.N
	if n != y.N {
		panic(badLength)
	}
	cblas128.Zswap(n, x.Data, x.Inc, y.Data, y.Inc)
}
// Copy writes the elements of x into y:
//
//	y[i] = x[i] for all i.
//
// Copy panics if x and y do not have the same length.
func Copy(x, y Vector) {
	n := x.N
	if n != y.N {
		panic(badLength)
	}
	cblas128.Zcopy(n, x.Data, x.Inc, y.Data, y.Inc)
}
// Axpy accumulates alpha times x into y:
//
//	y = alpha * x + y,
//
// where x and y are vectors, and alpha is a scalar.
//
// Axpy panics if x and y do not have the same length.
func Axpy(alpha complex128, x, y Vector) {
	n := x.N
	if n != y.N {
		panic(badLength)
	}
	cblas128.Zaxpy(n, alpha, x.Data, x.Inc, y.Data, y.Inc)
}
// Scal scales x by the complex scalar alpha:
//
//	x = alpha * x,
//
// where x is a vector, and alpha is a scalar.
//
// Scal panics if the increment of x is negative.
func Scal(alpha complex128, x Vector) {
	if inc := x.Inc; inc >= 0 {
		cblas128.Zscal(x.N, alpha, x.Data, inc)
		return
	}
	panic(negInc)
}
// Dscal scales x by the real scalar alpha:
//
//	x = alpha * x,
//
// where x is a vector, and alpha is a real scalar.
//
// Dscal panics if the increment of x is negative.
func Dscal(alpha float64, x Vector) {
	if inc := x.Inc; inc >= 0 {
		cblas128.Zdscal(x.N, alpha, x.Data, inc)
		return
	}
	panic(negInc)
}
// Level 2
// Gemv performs the matrix-vector operation
//
//	y = alpha * A * x + beta * y   if t == blas.NoTrans,
//	y = alpha * Aᵀ * x + beta * y  if t == blas.Trans,
//	y = alpha * Aᴴ * x + beta * y  if t == blas.ConjTrans,
//
// where A is an m×n dense matrix, x and y are vectors, and alpha and beta
// are scalars.
func Gemv(t blas.Transpose, alpha complex128, a General, x Vector, beta complex128, y Vector) {
	m, n := a.Rows, a.Cols
	cblas128.Zgemv(t, m, n, alpha, a.Data, a.Stride, x.Data, x.Inc, beta, y.Data, y.Inc)
}
// Gbmv performs the band matrix-vector operation
//
//	y = alpha * A * x + beta * y   if t == blas.NoTrans,
//	y = alpha * Aᵀ * x + beta * y  if t == blas.Trans,
//	y = alpha * Aᴴ * x + beta * y  if t == blas.ConjTrans,
//
// where A is an m×n band matrix, x and y are vectors, and alpha and beta
// are scalars.
func Gbmv(t blas.Transpose, alpha complex128, a Band, x Vector, beta complex128, y Vector) {
	m, n := a.Rows, a.Cols
	kl, ku := a.KL, a.KU
	cblas128.Zgbmv(t, m, n, kl, ku, alpha, a.Data, a.Stride, x.Data, x.Inc, beta, y.Data, y.Inc)
}
// Trmv performs the in-place triangular matrix-vector product
//
//	x = A * x   if t == blas.NoTrans,
//	x = Aᵀ * x  if t == blas.Trans,
//	x = Aᴴ * x  if t == blas.ConjTrans,
//
// where A is an n×n triangular matrix, and x is a vector.
func Trmv(t blas.Transpose, a Triangular, x Vector) {
	uplo, diag := a.Uplo, a.Diag
	cblas128.Ztrmv(uplo, t, diag, a.N, a.Data, a.Stride, x.Data, x.Inc)
}
// Tbmv performs the in-place triangular band matrix-vector product
//
//	x = A * x   if t == blas.NoTrans,
//	x = Aᵀ * x  if t == blas.Trans,
//	x = Aᴴ * x  if t == blas.ConjTrans,
//
// where A is an n×n triangular band matrix, and x is a vector.
func Tbmv(t blas.Transpose, a TriangularBand, x Vector) {
	uplo, diag := a.Uplo, a.Diag
	cblas128.Ztbmv(uplo, t, diag, a.N, a.K, a.Data, a.Stride, x.Data, x.Inc)
}
// Tpmv performs the in-place packed triangular matrix-vector product
//
//	x = A * x   if t == blas.NoTrans,
//	x = Aᵀ * x  if t == blas.Trans,
//	x = Aᴴ * x  if t == blas.ConjTrans,
//
// where A is an n×n triangular matrix in packed format, and x is a vector.
func Tpmv(t blas.Transpose, a TriangularPacked, x Vector) {
	uplo, diag := a.Uplo, a.Diag
	cblas128.Ztpmv(uplo, t, diag, a.N, a.Data, x.Data, x.Inc)
}
// Trsv solves the triangular system
//
//	A * x = b   if t == blas.NoTrans,
//	Aᵀ * x = b  if t == blas.Trans,
//	Aᴴ * x = b  if t == blas.ConjTrans,
//
// where A is an n×n triangular matrix and x is a vector.
//
// On entry x holds the values of b; on return it holds the solution, which
// is written in place.
//
// The routine does not test for singularity or near-singularity. Such
// tests must be performed before calling it.
func Trsv(t blas.Transpose, a Triangular, x Vector) {
	uplo, diag := a.Uplo, a.Diag
	cblas128.Ztrsv(uplo, t, diag, a.N, a.Data, a.Stride, x.Data, x.Inc)
}
// Tbsv solves the triangular band system
//
//	A * x = b   if t == blas.NoTrans,
//	Aᵀ * x = b  if t == blas.Trans,
//	Aᴴ * x = b  if t == blas.ConjTrans,
//
// where A is an n×n triangular band matrix, and x is a vector.
//
// On entry x holds the values of b; on return it holds the solution, which
// is written in place.
//
// The routine does not test for singularity or near-singularity. Such
// tests must be performed before calling it.
func Tbsv(t blas.Transpose, a TriangularBand, x Vector) {
	uplo, diag := a.Uplo, a.Diag
	cblas128.Ztbsv(uplo, t, diag, a.N, a.K, a.Data, a.Stride, x.Data, x.Inc)
}
// Tpsv solves one of the packed triangular systems
//
//	A * x = b   if t == blas.NoTrans,
//	Aᵀ * x = b  if t == blas.Trans,
//	Aᴴ * x = b  if t == blas.ConjTrans,
//
// with A an n×n triangular matrix in packed format and x a vector.
//
// On entry x holds the right-hand side b; on return it holds the
// solution, which is written in-place.
//
// The routine does not test for singularity or near-singularity.
// Callers must perform such tests before calling it.
func Tpsv(t blas.Transpose, a TriangularPacked, x Vector) {
	// Packed storage has no stride, so only the data slice is forwarded.
	cblas128.Ztpsv(
		a.Uplo, t, a.Diag, a.N,
		a.Data,
		x.Data, x.Inc,
	)
}
// Hemv performs the Hermitian matrix-vector operation
//
//	y = alpha * A * x + beta * y,
//
// with A an n×n Hermitian matrix, x and y vectors, and alpha and beta
// scalars.
func Hemv(alpha complex128, a Hermitian, x Vector, beta complex128, y Vector) {
	cblas128.Zhemv(
		a.Uplo, a.N,
		alpha, a.Data, a.Stride,
		x.Data, x.Inc,
		beta, y.Data, y.Inc,
	)
}
// Hbmv performs the Hermitian band matrix-vector operation
//
//	y = alpha * A * x + beta * y,
//
// with A an n×n Hermitian band matrix, x and y vectors, and alpha and
// beta scalars.
func Hbmv(alpha complex128, a HermitianBand, x Vector, beta complex128, y Vector) {
	cblas128.Zhbmv(
		a.Uplo, a.N, a.K,
		alpha, a.Data, a.Stride,
		x.Data, x.Inc,
		beta, y.Data, y.Inc,
	)
}
// Hpmv performs the packed Hermitian matrix-vector operation
//
//	y = alpha * A * x + beta * y,
//
// with A an n×n Hermitian matrix in packed format, x and y vectors, and
// alpha and beta scalars.
func Hpmv(alpha complex128, a HermitianPacked, x Vector, beta complex128, y Vector) {
	// Packed storage has no stride, so only the data slice is forwarded.
	cblas128.Zhpmv(
		a.Uplo, a.N,
		alpha, a.Data,
		x.Data, x.Inc,
		beta, y.Data, y.Inc,
	)
}
// Geru applies the unconjugated rank-1 update
//
//	A += alpha * x * yᵀ,
//
// with A an m×n dense matrix, x and y vectors, and alpha a scalar.
func Geru(alpha complex128, x, y Vector, a General) {
	cblas128.Zgeru(
		a.Rows, a.Cols,
		alpha,
		x.Data, x.Inc,
		y.Data, y.Inc,
		a.Data, a.Stride,
	)
}
// Gerc applies the conjugated rank-1 update
//
//	A += alpha * x * yᴴ,
//
// with A an m×n dense matrix, x and y vectors, and alpha a scalar.
func Gerc(alpha complex128, x, y Vector, a General) {
	cblas128.Zgerc(
		a.Rows, a.Cols,
		alpha,
		x.Data, x.Inc,
		y.Data, y.Inc,
		a.Data, a.Stride,
	)
}
// Her performs a Hermitian rank-1 update
//
//	A += alpha * x * xᴴ,
//
// where A is an n×n Hermitian matrix, x is a vector, and alpha is a
// real scalar.
func Her(alpha float64, x Vector, a Hermitian) {
cblas128.Zher(a.Uplo, a.N, alpha, x.Data, x.Inc, a.Data, a.Stride)
}
// Hpr applies the rank-1 update
//
//	A += alpha * x * xᴴ,
//
// with A an n×n Hermitian matrix in packed format, x a vector, and alpha
// a scalar.
func Hpr(alpha float64, x Vector, a HermitianPacked) {
	// Packed storage has no stride, so only the data slice is forwarded.
	cblas128.Zhpr(
		a.Uplo, a.N,
		alpha,
		x.Data, x.Inc,
		a.Data,
	)
}
// Her2 applies the rank-2 update
//
//	A += alpha * x * yᴴ + conj(alpha) * y * xᴴ,
//
// with A an n×n Hermitian matrix, x and y vectors, and alpha a scalar.
func Her2(alpha complex128, x, y Vector, a Hermitian) {
	cblas128.Zher2(
		a.Uplo, a.N,
		alpha,
		x.Data, x.Inc,
		y.Data, y.Inc,
		a.Data, a.Stride,
	)
}
// Hpr2 applies the rank-2 update
//
//	A += alpha * x * yᴴ + conj(alpha) * y * xᴴ,
//
// with A an n×n Hermitian matrix in packed format, x and y vectors, and
// alpha a scalar.
func Hpr2(alpha complex128, x, y Vector, a HermitianPacked) {
	// Packed storage has no stride, so only the data slice is forwarded.
	cblas128.Zhpr2(
		a.Uplo, a.N,
		alpha,
		x.Data, x.Inc,
		y.Data, y.Inc,
		a.Data,
	)
}
// Level 3
// Gemm performs the general matrix-matrix operation
//
//	C = alpha * A * B + beta * C,
//
// with A, B and C dense matrices and alpha and beta scalars.
// tA and tB select whether A or B are transposed or conjugated.
func Gemm(tA, tB blas.Transpose, alpha complex128, a, b General, beta complex128, c General) {
	// Start from the untransposed shapes and swap when an operand is
	// used transposed or conjugate-transposed.
	m, k := a.Rows, a.Cols
	if tA != blas.NoTrans {
		m, k = k, m
	}
	n := b.Cols
	if tB != blas.NoTrans {
		n = b.Rows
	}
	cblas128.Zgemm(
		tA, tB, m, n, k,
		alpha, a.Data, a.Stride,
		b.Data, b.Stride,
		beta, c.Data, c.Stride,
	)
}
// Symm performs the symmetric matrix-matrix operation
//
//	C = alpha * A * B + beta * C  if s == blas.Left,
//	C = alpha * B * A + beta * C  if s == blas.Right,
//
// with A an n×n or m×m symmetric matrix, B and C m×n matrices, and alpha
// and beta scalars.
func Symm(s blas.Side, alpha complex128, a Symmetric, b General, beta complex128, c General) {
	// Default to the right-side shape; a left-side A fixes the row
	// count instead of the column count.
	m, n := b.Rows, a.N
	if s == blas.Left {
		m, n = a.N, b.Cols
	}
	cblas128.Zsymm(
		s, a.Uplo, m, n,
		alpha, a.Data, a.Stride,
		b.Data, b.Stride,
		beta, c.Data, c.Stride,
	)
}
// Syrk applies the symmetric rank-k update
//
//	C = alpha * A * Aᵀ + beta * C  if t == blas.NoTrans,
//	C = alpha * Aᵀ * A + beta * C  if t == blas.Trans,
//
// with C an n×n symmetric matrix, A an n×k matrix if t == blas.NoTrans
// and a k×n matrix otherwise, and alpha and beta scalars.
func Syrk(t blas.Transpose, alpha complex128, a General, beta complex128, c Symmetric) {
	// n and k follow A's shape, swapped when A is used transposed.
	n, k := a.Rows, a.Cols
	if t != blas.NoTrans {
		n, k = k, n
	}
	cblas128.Zsyrk(
		c.Uplo, t, n, k,
		alpha, a.Data, a.Stride,
		beta, c.Data, c.Stride,
	)
}
// Syr2k applies the symmetric rank-2k update
//
//	C = alpha * A * Bᵀ + alpha * B * Aᵀ + beta * C  if t == blas.NoTrans,
//	C = alpha * Aᵀ * B + alpha * Bᵀ * A + beta * C  if t == blas.Trans,
//
// with C an n×n symmetric matrix, A and B n×k matrices if
// t == blas.NoTrans and k×n otherwise, and alpha and beta scalars.
func Syr2k(t blas.Transpose, alpha complex128, a, b General, beta complex128, c Symmetric) {
	// The dimensions are taken from A only; B is passed through as is.
	n, k := a.Cols, a.Rows
	if t == blas.NoTrans {
		n, k = a.Rows, a.Cols
	}
	cblas128.Zsyr2k(
		c.Uplo, t, n, k,
		alpha, a.Data, a.Stride,
		b.Data, b.Stride,
		beta, c.Data, c.Stride,
	)
}
// Trmm performs the triangular matrix-matrix product
//
//	B = alpha * A * B   if tA == blas.NoTrans and s == blas.Left,
//	B = alpha * Aᵀ * B  if tA == blas.Trans and s == blas.Left,
//	B = alpha * Aᴴ * B  if tA == blas.ConjTrans and s == blas.Left,
//	B = alpha * B * A   if tA == blas.NoTrans and s == blas.Right,
//	B = alpha * B * Aᵀ  if tA == blas.Trans and s == blas.Right,
//	B = alpha * B * Aᴴ  if tA == blas.ConjTrans and s == blas.Right,
//
// with A an n×n or m×m triangular matrix, B an m×n matrix, and alpha a
// scalar.
func Trmm(s blas.Side, tA blas.Transpose, alpha complex128, a Triangular, b General) {
	// The m and n arguments are taken from B's dimensions.
	cblas128.Ztrmm(
		s, a.Uplo, tA, a.Diag,
		b.Rows, b.Cols,
		alpha, a.Data, a.Stride,
		b.Data, b.Stride,
	)
}
// Trsm solves one of the triangular matrix equations
//
//	A * X = alpha * B   if tA == blas.NoTrans and s == blas.Left,
//	Aᵀ * X = alpha * B  if tA == blas.Trans and s == blas.Left,
//	Aᴴ * X = alpha * B  if tA == blas.ConjTrans and s == blas.Left,
//	X * A = alpha * B   if tA == blas.NoTrans and s == blas.Right,
//	X * Aᵀ = alpha * B  if tA == blas.Trans and s == blas.Right,
//	X * Aᴴ = alpha * B  if tA == blas.ConjTrans and s == blas.Right,
//
// with A an n×n or m×m triangular matrix, X and B m×n matrices, and
// alpha a scalar.
//
// On entry b holds the values of B; on return it holds the result,
// which is written in-place.
//
// A is not checked for invertibility.
func Trsm(s blas.Side, tA blas.Transpose, alpha complex128, a Triangular, b General) {
	// The m and n arguments are taken from B's dimensions.
	cblas128.Ztrsm(
		s, a.Uplo, tA, a.Diag,
		b.Rows, b.Cols,
		alpha, a.Data, a.Stride,
		b.Data, b.Stride,
	)
}
// Hemm performs the Hermitian matrix-matrix operation
//
//	C = alpha * A * B + beta * C  if s == blas.Left,
//	C = alpha * B * A + beta * C  if s == blas.Right,
//
// with A an n×n or m×m Hermitian matrix, B and C m×n matrices, and alpha
// and beta scalars.
func Hemm(s blas.Side, alpha complex128, a Hermitian, b General, beta complex128, c General) {
	// Default to the left-side shape; any other side takes the row
	// count from B and the column count from A.
	m, n := a.N, b.Cols
	if s != blas.Left {
		m, n = b.Rows, a.N
	}
	cblas128.Zhemm(
		s, a.Uplo, m, n,
		alpha, a.Data, a.Stride,
		b.Data, b.Stride,
		beta, c.Data, c.Stride,
	)
}
// Herk applies the Hermitian rank-k update
//
//	C = alpha * A * Aᴴ + beta*C  if t == blas.NoTrans,
//	C = alpha * Aᴴ * A + beta*C  if t == blas.ConjTrans,
//
// with C an n×n Hermitian matrix, A an n×k matrix if t == blas.NoTrans
// and a k×n matrix otherwise, and alpha and beta scalars.
func Herk(t blas.Transpose, alpha float64, a General, beta float64, c Hermitian) {
	// n and k follow A's shape, swapped when A is not used as stored.
	n, k := a.Rows, a.Cols
	if t != blas.NoTrans {
		n, k = k, n
	}
	cblas128.Zherk(
		c.Uplo, t, n, k,
		alpha, a.Data, a.Stride,
		beta, c.Data, c.Stride,
	)
}
// Her2k applies the Hermitian rank-2k update
//
//	C = alpha * A * Bᴴ + conj(alpha) * B * Aᴴ + beta * C  if t == blas.NoTrans,
//	C = alpha * Aᴴ * B + conj(alpha) * Bᴴ * A + beta * C  if t == blas.ConjTrans,
//
// with C an n×n Hermitian matrix, A and B n×k matrices if t == NoTrans
// and k×n matrices otherwise, and alpha and beta scalars.
func Her2k(t blas.Transpose, alpha complex128, a, b General, beta float64, c Hermitian) {
	// The dimensions are taken from A only; B is passed through as is.
	n, k := a.Cols, a.Rows
	if t == blas.NoTrans {
		n, k = a.Rows, a.Cols
	}
	cblas128.Zher2k(
		c.Uplo, t, n, k,
		alpha, a.Data, a.Stride,
		b.Data, b.Stride,
		beta, c.Data, c.Stride,
	)
}

279
vendor/gonum.org/v1/gonum/blas/cblas128/conv.go generated vendored Normal file
View File

@@ -0,0 +1,279 @@
// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.
// Copyright ©2015 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cblas128
import "gonum.org/v1/gonum/blas"
// GeneralCols is a General matrix whose data is held in the conventional
// column-major storage scheme.
type GeneralCols General

// From copies the elements of a into the receiver. It panics if the
// receiver's dimensions differ from those of a or if the receiver's
// backing data slice is too short for its dimensions and stride.
func (t GeneralCols) From(a General) {
	if t.Rows != a.Rows || t.Cols != a.Cols {
		panic("cblas128: mismatched dimension")
	}
	if len(t.Data) < (t.Cols-1)*t.Stride+t.Rows {
		panic("cblas128: short data slice")
	}
	for r := 0; r < a.Rows; r++ {
		// Row r of the source is contiguous; scatter it down the
		// destination, one stride per column.
		row := a.Data[r*a.Stride : r*a.Stride+a.Cols]
		for c := range row {
			t.Data[r+c*t.Stride] = row[c]
		}
	}
}
// From copies the elements of the column-major matrix a into the
// receiver. It panics if the receiver's dimensions differ from those of
// a or if the receiver's backing data slice is too short for its
// dimensions and stride.
func (t General) From(a GeneralCols) {
	if t.Rows != a.Rows || t.Cols != a.Cols {
		panic("cblas128: mismatched dimension")
	}
	if len(t.Data) < (t.Rows-1)*t.Stride+t.Cols {
		panic("cblas128: short data slice")
	}
	for c := 0; c < a.Cols; c++ {
		// Column c of the source is contiguous; scatter it across the
		// destination, one stride per row.
		col := a.Data[c*a.Stride : c*a.Stride+a.Rows]
		for r := range col {
			t.Data[r*t.Stride+c] = col[r]
		}
	}
}
// TriangularCols is a Triangular matrix whose data is held in the
// conventional column-major storage scheme.
type TriangularCols Triangular

// From copies the elements of a into the receiver. It panics if the
// receiver's dimension, uplo or diag differ from those of a, or if uplo
// is not one of blas.Upper, blas.Lower or blas.All. The receiver must
// have adequate backing data storage.
func (t TriangularCols) From(a Triangular) {
	switch {
	case t.N != a.N:
		panic("cblas128: mismatched dimension")
	case t.Uplo != a.Uplo:
		panic("cblas128: mismatched BLAS uplo")
	case t.Diag != a.Diag:
		panic("cblas128: mismatched BLAS diag")
	}
	n := a.N
	switch a.Uplo {
	case blas.Upper:
		// Copy the diagonal and everything to its right.
		for r := 0; r < n; r++ {
			for c := r; c < n; c++ {
				t.Data[r+c*t.Stride] = a.Data[r*a.Stride+c]
			}
		}
	case blas.Lower:
		// Copy everything up to and including the diagonal.
		for r := 0; r < n; r++ {
			for c := 0; c <= r; c++ {
				t.Data[r+c*t.Stride] = a.Data[r*a.Stride+c]
			}
		}
	case blas.All:
		for r := 0; r < n; r++ {
			for c := 0; c < n; c++ {
				t.Data[r+c*t.Stride] = a.Data[r*a.Stride+c]
			}
		}
	default:
		panic("cblas128: bad BLAS uplo")
	}
}
// From copies the elements of the column-major matrix a into the
// receiver. It panics if the receiver's dimension, uplo or diag differ
// from those of a, or if uplo is not one of blas.Upper, blas.Lower or
// blas.All. The receiver must have adequate backing data storage.
func (t Triangular) From(a TriangularCols) {
	switch {
	case t.N != a.N:
		panic("cblas128: mismatched dimension")
	case t.Uplo != a.Uplo:
		panic("cblas128: mismatched BLAS uplo")
	case t.Diag != a.Diag:
		panic("cblas128: mismatched BLAS diag")
	}
	n := a.N
	switch a.Uplo {
	case blas.Upper:
		// Copy the diagonal and everything to its right.
		for r := 0; r < n; r++ {
			for c := r; c < n; c++ {
				t.Data[r*t.Stride+c] = a.Data[r+c*a.Stride]
			}
		}
	case blas.Lower:
		// Copy everything up to and including the diagonal.
		for r := 0; r < n; r++ {
			for c := 0; c <= r; c++ {
				t.Data[r*t.Stride+c] = a.Data[r+c*a.Stride]
			}
		}
	case blas.All:
		for r := 0; r < n; r++ {
			for c := 0; c < n; c++ {
				t.Data[r*t.Stride+c] = a.Data[r+c*a.Stride]
			}
		}
	default:
		panic("cblas128: bad BLAS uplo")
	}
}
// BandCols is a Band matrix whose data is held in the band column-major
// storage scheme.
type BandCols Band

// From copies the elements of a into the receiver. It panics if the
// receiver's dimensions or bandwidths differ from those of a, or if
// either stride is smaller than KL+KU+1. The receiver must have adequate
// backing data storage.
func (t BandCols) From(a Band) {
	switch {
	case t.Rows != a.Rows || t.Cols != a.Cols:
		panic("cblas128: mismatched dimension")
	case t.KL != a.KL || t.KU != a.KU:
		panic("cblas128: mismatched bandwidth")
	case a.Stride < a.KL+a.KU+1:
		panic("cblas128: short stride for source")
	case t.Stride < t.KL+t.KU+1:
		panic("cblas128: short stride for destination")
	}
	for i := 0; i < a.Rows; i++ {
		// Only columns inside the band of row i are stored.
		first := max(0, i-a.KL)
		end := min(i+a.KU+1, a.Cols)
		for j := first; j < end; j++ {
			t.Data[i+t.KU-j+j*t.Stride] = a.Data[j+a.KL-i+i*a.Stride]
		}
	}
}
// From fills the receiver with elements from a. The receiver
// must have the same dimensions and bandwidth as a and have
// adequate backing data storage.
//
// It panics if the dimensions or bandwidths of t and a differ, or if
// either stride is smaller than KL+KU+1.
func (t Band) From(a BandCols) {
	if t.Rows != a.Rows || t.Cols != a.Cols {
		panic("cblas128: mismatched dimension")
	}
	if t.KL != a.KL || t.KU != a.KU {
		panic("cblas128: mismatched bandwidth")
	}
	if a.Stride < a.KL+a.KU+1 {
		panic("cblas128: short stride for source")
	}
	if t.Stride < t.KL+t.KU+1 {
		panic("cblas128: short stride for destination")
	}
	for j := 0; j < a.Cols; j++ {
		// Only rows inside the band of column j are stored.
		for i := max(0, j-a.KU); i < min(j+a.KL+1, a.Rows); i++ {
			// Each slice must be indexed with its own stride: the
			// destination is row-major band storage (t.Stride) and the
			// source is column-major band storage (a.Stride). The two
			// strides are not required to be equal.
			t.Data[j+t.KL-i+i*t.Stride] = a.Data[i+a.KU-j+j*a.Stride]
		}
	}
}
// TriangularBandCols is a TriangularBand matrix whose data is held in
// the band column-major storage scheme.
type TriangularBandCols TriangularBand

// From copies the elements of a into the receiver. It panics if the
// receiver's dimension, bandwidth, uplo or diag differ from those of a,
// if either stride is smaller than K+1, or if uplo is neither blas.Upper
// nor blas.Lower. The receiver must have adequate backing data storage.
func (t TriangularBandCols) From(a TriangularBand) {
	switch {
	case t.N != a.N:
		panic("cblas128: mismatched dimension")
	case t.K != a.K:
		panic("cblas128: mismatched bandwidth")
	case a.Stride < a.K+1:
		panic("cblas128: short stride for source")
	case t.Stride < t.K+1:
		panic("cblas128: short stride for destination")
	case t.Uplo != a.Uplo:
		panic("cblas128: mismatched BLAS uplo")
	case t.Diag != a.Diag:
		panic("cblas128: mismatched BLAS diag")
	}
	// View both matrices as general band matrices with a single
	// non-zero bandwidth and delegate the copy to BandCols.From.
	colMajor := BandCols{Rows: t.N, Cols: t.N, Stride: t.Stride, Data: t.Data}
	rowMajor := Band{Rows: a.N, Cols: a.N, Stride: a.Stride, Data: a.Data}
	switch a.Uplo {
	case blas.Upper:
		colMajor.KU, rowMajor.KU = t.K, a.K
	case blas.Lower:
		colMajor.KL, rowMajor.KL = t.K, a.K
	default:
		panic("cblas128: bad BLAS uplo")
	}
	colMajor.From(rowMajor)
}
// From copies the elements of the column-major band matrix a into the
// receiver. It panics if the receiver's dimension, bandwidth, uplo or
// diag differ from those of a, if either stride is smaller than K+1, or
// if uplo is neither blas.Upper nor blas.Lower. The receiver must have
// adequate backing data storage.
func (t TriangularBand) From(a TriangularBandCols) {
	switch {
	case t.N != a.N:
		panic("cblas128: mismatched dimension")
	case t.K != a.K:
		panic("cblas128: mismatched bandwidth")
	case a.Stride < a.K+1:
		panic("cblas128: short stride for source")
	case t.Stride < t.K+1:
		panic("cblas128: short stride for destination")
	case t.Uplo != a.Uplo:
		panic("cblas128: mismatched BLAS uplo")
	case t.Diag != a.Diag:
		panic("cblas128: mismatched BLAS diag")
	}
	// View both matrices as general band matrices with a single
	// non-zero bandwidth and delegate the copy to Band.From.
	rowMajor := Band{Rows: t.N, Cols: t.N, Stride: t.Stride, Data: t.Data}
	colMajor := BandCols{Rows: a.N, Cols: a.N, Stride: a.Stride, Data: a.Data}
	switch a.Uplo {
	case blas.Upper:
		rowMajor.KU, colMajor.KU = t.K, a.K
	case blas.Lower:
		rowMajor.KL, colMajor.KL = t.K, a.K
	default:
		panic("cblas128: bad BLAS uplo")
	}
	rowMajor.From(colMajor)
}
// min returns the smaller of a and b.
func min(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
// max returns the larger of a and b.
func max(a, b int) int {
	if b >= a {
		return b
	}
	return a
}

View File

@@ -0,0 +1,155 @@
// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.
// Copyright ©2015 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cblas128
import "gonum.org/v1/gonum/blas"
// HermitianCols is a Hermitian matrix whose data is held in the
// conventional column-major storage scheme.
type HermitianCols Hermitian

// From copies the elements of a into the receiver. It panics if the
// receiver's dimension or uplo differ from those of a, or if uplo is
// neither blas.Upper nor blas.Lower. The receiver must have adequate
// backing data storage.
func (t HermitianCols) From(a Hermitian) {
	switch {
	case t.N != a.N:
		panic("cblas128: mismatched dimension")
	case t.Uplo != a.Uplo:
		panic("cblas128: mismatched BLAS uplo")
	}
	n := a.N
	switch a.Uplo {
	case blas.Upper:
		// Copy the diagonal and everything to its right.
		for r := 0; r < n; r++ {
			for c := r; c < n; c++ {
				t.Data[r+c*t.Stride] = a.Data[r*a.Stride+c]
			}
		}
	case blas.Lower:
		// Copy everything up to and including the diagonal.
		for r := 0; r < n; r++ {
			for c := 0; c <= r; c++ {
				t.Data[r+c*t.Stride] = a.Data[r*a.Stride+c]
			}
		}
	default:
		panic("cblas128: bad BLAS uplo")
	}
}
// From copies the elements of the column-major matrix a into the
// receiver. It panics if the receiver's dimension or uplo differ from
// those of a, or if uplo is neither blas.Upper nor blas.Lower. The
// receiver must have adequate backing data storage.
func (t Hermitian) From(a HermitianCols) {
	switch {
	case t.N != a.N:
		panic("cblas128: mismatched dimension")
	case t.Uplo != a.Uplo:
		panic("cblas128: mismatched BLAS uplo")
	}
	n := a.N
	switch a.Uplo {
	case blas.Upper:
		// Copy the diagonal and everything to its right.
		for r := 0; r < n; r++ {
			for c := r; c < n; c++ {
				t.Data[r*t.Stride+c] = a.Data[r+c*a.Stride]
			}
		}
	case blas.Lower:
		// Copy everything up to and including the diagonal.
		for r := 0; r < n; r++ {
			for c := 0; c <= r; c++ {
				t.Data[r*t.Stride+c] = a.Data[r+c*a.Stride]
			}
		}
	default:
		panic("cblas128: bad BLAS uplo")
	}
}
// HermitianBandCols is a HermitianBand matrix whose data is held in the
// band column-major storage scheme.
type HermitianBandCols HermitianBand

// From copies the elements of a into the receiver. It panics if the
// receiver's dimension, bandwidth or uplo differ from those of a, if
// either stride is smaller than K+1, or if uplo is neither blas.Upper
// nor blas.Lower. The receiver must have adequate backing data storage.
func (t HermitianBandCols) From(a HermitianBand) {
	switch {
	case t.N != a.N:
		panic("cblas128: mismatched dimension")
	case t.K != a.K:
		panic("cblas128: mismatched bandwidth")
	case a.Stride < a.K+1:
		panic("cblas128: short stride for source")
	case t.Stride < t.K+1:
		panic("cblas128: short stride for destination")
	case t.Uplo != a.Uplo:
		panic("cblas128: mismatched BLAS uplo")
	}
	// View both matrices as general band matrices with a single
	// non-zero bandwidth and delegate the copy to BandCols.From.
	colMajor := BandCols{Rows: t.N, Cols: t.N, Stride: t.Stride, Data: t.Data}
	rowMajor := Band{Rows: a.N, Cols: a.N, Stride: a.Stride, Data: a.Data}
	switch a.Uplo {
	case blas.Upper:
		colMajor.KU, rowMajor.KU = t.K, a.K
	case blas.Lower:
		colMajor.KL, rowMajor.KL = t.K, a.K
	default:
		panic("cblas128: bad BLAS uplo")
	}
	colMajor.From(rowMajor)
}
// From copies the elements of the column-major band matrix a into the
// receiver. It panics if the receiver's dimension, bandwidth or uplo
// differ from those of a, if either stride is smaller than K+1, or if
// uplo is neither blas.Upper nor blas.Lower. The receiver must have
// adequate backing data storage.
func (t HermitianBand) From(a HermitianBandCols) {
	switch {
	case t.N != a.N:
		panic("cblas128: mismatched dimension")
	case t.K != a.K:
		panic("cblas128: mismatched bandwidth")
	case a.Stride < a.K+1:
		panic("cblas128: short stride for source")
	case t.Stride < t.K+1:
		panic("cblas128: short stride for destination")
	case t.Uplo != a.Uplo:
		panic("cblas128: mismatched BLAS uplo")
	}
	// View both matrices as general band matrices with a single
	// non-zero bandwidth and delegate the copy to Band.From.
	rowMajor := Band{Rows: t.N, Cols: t.N, Stride: t.Stride, Data: t.Data}
	colMajor := BandCols{Rows: a.N, Cols: a.N, Stride: a.Stride, Data: a.Data}
	switch a.Uplo {
	case blas.Upper:
		rowMajor.KU, colMajor.KU = t.K, a.K
	case blas.Lower:
		rowMajor.KL, colMajor.KL = t.K, a.K
	default:
		panic("cblas128: bad BLAS uplo")
	}
	rowMajor.From(colMajor)
}

View File

@@ -0,0 +1,155 @@
// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.
// Copyright ©2015 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package cblas128
import "gonum.org/v1/gonum/blas"
// SymmetricCols is a Symmetric matrix whose data is held in the
// conventional column-major storage scheme.
type SymmetricCols Symmetric

// From copies the elements of a into the receiver. It panics if the
// receiver's dimension or uplo differ from those of a, or if uplo is
// neither blas.Upper nor blas.Lower. The receiver must have adequate
// backing data storage.
func (t SymmetricCols) From(a Symmetric) {
	if t.N != a.N {
		panic("cblas128: mismatched dimension")
	}
	if t.Uplo != a.Uplo {
		panic("cblas128: mismatched BLAS uplo")
	}
	// Validate uplo up front so that an invalid value panics even for
	// an empty matrix.
	var upper bool
	switch a.Uplo {
	case blas.Upper:
		upper = true
	case blas.Lower:
		upper = false
	default:
		panic("cblas128: bad BLAS uplo")
	}
	for i := 0; i < a.N; i++ {
		// Lower: columns [0, i]; upper: columns [i, N).
		lo, hi := 0, i+1
		if upper {
			lo, hi = i, a.N
		}
		for j := lo; j < hi; j++ {
			t.Data[i+j*t.Stride] = a.Data[i*a.Stride+j]
		}
	}
}
// From copies the elements of the column-major matrix a into the
// receiver. It panics if the receiver's dimension or uplo differ from
// those of a, or if uplo is neither blas.Upper nor blas.Lower. The
// receiver must have adequate backing data storage.
func (t Symmetric) From(a SymmetricCols) {
	if t.N != a.N {
		panic("cblas128: mismatched dimension")
	}
	if t.Uplo != a.Uplo {
		panic("cblas128: mismatched BLAS uplo")
	}
	// Validate uplo up front so that an invalid value panics even for
	// an empty matrix.
	var upper bool
	switch a.Uplo {
	case blas.Upper:
		upper = true
	case blas.Lower:
		upper = false
	default:
		panic("cblas128: bad BLAS uplo")
	}
	for i := 0; i < a.N; i++ {
		// Lower: columns [0, i]; upper: columns [i, N).
		lo, hi := 0, i+1
		if upper {
			lo, hi = i, a.N
		}
		for j := lo; j < hi; j++ {
			t.Data[i*t.Stride+j] = a.Data[i+j*a.Stride]
		}
	}
}
// SymmetricBandCols is a SymmetricBand matrix whose data is held in the
// band column-major storage scheme.
type SymmetricBandCols SymmetricBand

// From copies the elements of a into the receiver. It panics if the
// receiver's dimension, bandwidth or uplo differ from those of a, if
// either stride is smaller than K+1, or if uplo is neither blas.Upper
// nor blas.Lower. The receiver must have adequate backing data storage.
func (t SymmetricBandCols) From(a SymmetricBand) {
	switch {
	case t.N != a.N:
		panic("cblas128: mismatched dimension")
	case t.K != a.K:
		panic("cblas128: mismatched bandwidth")
	case a.Stride < a.K+1:
		panic("cblas128: short stride for source")
	case t.Stride < t.K+1:
		panic("cblas128: short stride for destination")
	case t.Uplo != a.Uplo:
		panic("cblas128: mismatched BLAS uplo")
	}
	// View both matrices as general band matrices with a single
	// non-zero bandwidth and delegate the copy to BandCols.From.
	colMajor := BandCols{Rows: t.N, Cols: t.N, Stride: t.Stride, Data: t.Data}
	rowMajor := Band{Rows: a.N, Cols: a.N, Stride: a.Stride, Data: a.Data}
	switch a.Uplo {
	case blas.Upper:
		colMajor.KU, rowMajor.KU = t.K, a.K
	case blas.Lower:
		colMajor.KL, rowMajor.KL = t.K, a.K
	default:
		panic("cblas128: bad BLAS uplo")
	}
	colMajor.From(rowMajor)
}
// From copies the elements of the column-major band matrix a into the
// receiver. It panics if the receiver's dimension, bandwidth or uplo
// differ from those of a, if either stride is smaller than K+1, or if
// uplo is neither blas.Upper nor blas.Lower. The receiver must have
// adequate backing data storage.
func (t SymmetricBand) From(a SymmetricBandCols) {
	switch {
	case t.N != a.N:
		panic("cblas128: mismatched dimension")
	case t.K != a.K:
		panic("cblas128: mismatched bandwidth")
	case a.Stride < a.K+1:
		panic("cblas128: short stride for source")
	case t.Stride < t.K+1:
		panic("cblas128: short stride for destination")
	case t.Uplo != a.Uplo:
		panic("cblas128: mismatched BLAS uplo")
	}
	// View both matrices as general band matrices with a single
	// non-zero bandwidth and delegate the copy to Band.From.
	rowMajor := Band{Rows: t.N, Cols: t.N, Stride: t.Stride, Data: t.Data}
	colMajor := BandCols{Rows: a.N, Cols: a.N, Stride: a.Stride, Data: a.Data}
	switch a.Uplo {
	case blas.Upper:
		rowMajor.KU, colMajor.KU = t.K, a.K
	case blas.Lower:
		rowMajor.KL, colMajor.KL = t.K, a.K
	default:
		panic("cblas128: bad BLAS uplo")
	}
	rowMajor.From(colMajor)
}

6
vendor/gonum.org/v1/gonum/blas/cblas128/doc.go generated vendored Normal file
View File

@@ -0,0 +1,6 @@
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package cblas128 provides a simple interface to the complex128 BLAS API.
package cblas128 // import "gonum.org/v1/gonum/blas/cblas128"

159
vendor/gonum.org/v1/gonum/blas/conversions.bash generated vendored Normal file
View File

@@ -0,0 +1,159 @@
#!/usr/bin/env bash
# Copyright ©2017 The Gonum Authors. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
# Generate code for blas32.
#
# Every stanza below follows the same three steps:
#   1. announce the file being generated,
#   2. truncate the target and write the "Code generated" header with '>',
#   3. append ('>>') the blas64 source after rewriting it with gofmt -r
#      (type identifier) and sed (package name, imports).
# All paths are relative, so the script relies on the current directory
# containing the blas64 and blas32 directories.
# The sed expressions have no 'g' flag unless shown, so they replace only
# the first match on each line.
# NOTE(review): the header string closes its inner quotation with a
# typographic quote (”) rather than an ASCII one; confirm this is intended.

# conv.go: float64 -> float32, package blas64 -> blas32.
echo Generating blas32/conv.go
echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > blas32/conv.go
cat blas64/conv.go \
| gofmt -r 'float64 -> float32' \
\
| sed -e 's/blas64/blas32/' \
\
>> blas32/conv.go
# conv_test.go: as above, and the "math" import is replaced by the
# internal math32 package imported under the name math.
echo Generating blas32/conv_test.go
echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > blas32/conv_test.go
cat blas64/conv_test.go \
| gofmt -r 'float64 -> float32' \
\
| sed -e 's/blas64/blas32/' \
-e 's_"math"_math "gonum.org/v1/gonum/internal/math32"_' \
\
>> blas32/conv_test.go
# conv_symmetric.go: float64 -> float32, package blas64 -> blas32.
echo Generating blas32/conv_symmetric.go
echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > blas32/conv_symmetric.go
cat blas64/conv_symmetric.go \
| gofmt -r 'float64 -> float32' \
\
| sed -e 's/blas64/blas32/' \
\
>> blas32/conv_symmetric.go
# conv_symmetric_test.go: as above, plus the math -> math32 import rewrite.
echo Generating blas32/conv_symmetric_test.go
echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > blas32/conv_symmetric_test.go
cat blas64/conv_symmetric_test.go \
| gofmt -r 'float64 -> float32' \
\
| sed -e 's/blas64/blas32/' \
-e 's_"math"_math "gonum.org/v1/gonum/internal/math32"_' \
\
>> blas32/conv_symmetric_test.go
# Generate code for cblas128.
#
# Each stanza writes the "Code generated" header to the target with '>'
# and then appends ('>>') the blas64 source with float64 rewritten to
# complex128 by gofmt -r and the package name rewritten by sed.
# Test files additionally replace the "math" import with math/cmplx
# imported under the name math.

# conv.go: float64 -> complex128, package blas64 -> cblas128.
echo Generating cblas128/conv.go
echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas128/conv.go
cat blas64/conv.go \
| gofmt -r 'float64 -> complex128' \
\
| sed -e 's/blas64/cblas128/' \
\
>> cblas128/conv.go
# conv_test.go: as above, plus the math -> math/cmplx import rewrite.
echo Generating cblas128/conv_test.go
echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas128/conv_test.go
cat blas64/conv_test.go \
| gofmt -r 'float64 -> complex128' \
\
| sed -e 's/blas64/cblas128/' \
-e 's_"math"_math "math/cmplx"_' \
\
>> cblas128/conv_test.go
# conv_symmetric.go: float64 -> complex128, package blas64 -> cblas128.
echo Generating cblas128/conv_symmetric.go
echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas128/conv_symmetric.go
cat blas64/conv_symmetric.go \
| gofmt -r 'float64 -> complex128' \
\
| sed -e 's/blas64/cblas128/' \
\
>> cblas128/conv_symmetric.go
# conv_symmetric_test.go: as above, plus the math -> math/cmplx import rewrite.
echo Generating cblas128/conv_symmetric_test.go
echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas128/conv_symmetric_test.go
cat blas64/conv_symmetric_test.go \
| gofmt -r 'float64 -> complex128' \
\
| sed -e 's/blas64/cblas128/' \
-e 's_"math"_math "math/cmplx"_' \
\
>> cblas128/conv_symmetric_test.go
# conv_hermitian.go is derived from the symmetric source by renaming.
# The sed expressions run in the order given: "Symmetric" first, then the
# phrase "a symmetric" (so the article becomes "an"), then any remaining
# lower-case "symmetric", and finally the "Sym" prefix.
echo Generating cblas128/conv_hermitian.go
echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas128/conv_hermitian.go
cat blas64/conv_symmetric.go \
| gofmt -r 'float64 -> complex128' \
\
| sed -e 's/blas64/cblas128/' \
-e 's/Symmetric/Hermitian/g' \
-e 's/a symmetric/an Hermitian/g' \
-e 's/symmetric/hermitian/g' \
-e 's/Sym/Herm/g' \
\
>> cblas128/conv_hermitian.go
# conv_hermitian_test.go: same renaming as conv_hermitian.go, plus the
# math -> math/cmplx import rewrite.
echo Generating cblas128/conv_hermitian_test.go
echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas128/conv_hermitian_test.go
cat blas64/conv_symmetric_test.go \
| gofmt -r 'float64 -> complex128' \
\
| sed -e 's/blas64/cblas128/' \
-e 's/Symmetric/Hermitian/g' \
-e 's/a symmetric/an Hermitian/g' \
-e 's/symmetric/hermitian/g' \
-e 's/Sym/Herm/g' \
-e 's_"math"_math "math/cmplx"_' \
\
>> cblas128/conv_hermitian_test.go
# Generate code for cblas64.
#
# Each stanza writes the "Code generated" header to the target with '>'
# and then appends ('>>') the blas64 source with float64 rewritten to
# complex64 by gofmt -r and the package name rewritten by sed.
# Test files additionally replace the "math" import with the internal
# cmplx64 package imported under the name math.
# This section contains no stanza for conv_symmetric.go; only the
# Hermitian variants are derived from the symmetric sources.

# conv.go: float64 -> complex64, package blas64 -> cblas64.
echo Generating cblas64/conv.go
echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas64/conv.go
cat blas64/conv.go \
| gofmt -r 'float64 -> complex64' \
\
| sed -e 's/blas64/cblas64/' \
\
>> cblas64/conv.go
# conv_test.go: as above, plus the math -> cmplx64 import rewrite.
echo Generating cblas64/conv_test.go
echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas64/conv_test.go
cat blas64/conv_test.go \
| gofmt -r 'float64 -> complex64' \
\
| sed -e 's/blas64/cblas64/' \
-e 's_"math"_math "gonum.org/v1/gonum/internal/cmplx64"_' \
\
>> cblas64/conv_test.go
# conv_hermitian.go is derived from the symmetric source by renaming.
# The sed expressions run in the order given: "Symmetric" first, then the
# phrase "a symmetric" (so the article becomes "an"), then any remaining
# lower-case "symmetric", and finally the "Sym" prefix.
echo Generating cblas64/conv_hermitian.go
echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas64/conv_hermitian.go
cat blas64/conv_symmetric.go \
| gofmt -r 'float64 -> complex64' \
\
| sed -e 's/blas64/cblas64/' \
-e 's/Symmetric/Hermitian/g' \
-e 's/a symmetric/an Hermitian/g' \
-e 's/symmetric/hermitian/g' \
-e 's/Sym/Herm/g' \
\
>> cblas64/conv_hermitian.go
# conv_hermitian_test.go: same renaming as conv_hermitian.go, plus the
# math -> cmplx64 import rewrite.
echo Generating cblas64/conv_hermitian_test.go
echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas64/conv_hermitian_test.go
cat blas64/conv_symmetric_test.go \
| gofmt -r 'float64 -> complex64' \
\
| sed -e 's/blas64/cblas64/' \
-e 's/Symmetric/Hermitian/g' \
-e 's/a symmetric/an Hermitian/g' \
-e 's/symmetric/hermitian/g' \
-e 's/Sym/Herm/g' \
-e 's_"math"_math "gonum.org/v1/gonum/internal/cmplx64"_' \
\
>> cblas64/conv_hermitian_test.go

108
vendor/gonum.org/v1/gonum/blas/doc.go generated vendored Normal file
View File

@@ -0,0 +1,108 @@
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
/*
Package blas provides interfaces for the BLAS linear algebra standard.
All methods must perform appropriate parameter checking and panic if
provided parameters that do not conform to the requirements specified
by the BLAS standard.
Quick Reference Guide to the BLAS from http://www.netlib.org/lapack/lug/node145.html
This version is modified to remove the "order" option. All matrix operations are
on row-order matrices.
Level 1 BLAS
dim scalar vector vector scalars 5-element prefixes
struct
_rotg ( a, b ) S, D
_rotmg( d1, d2, a, b ) S, D
_rot ( n, x, incX, y, incY, c, s ) S, D
_rotm ( n, x, incX, y, incY, param ) S, D
_swap ( n, x, incX, y, incY ) S, D, C, Z
_scal ( n, alpha, x, incX ) S, D, C, Z, Cs, Zd
_copy ( n, x, incX, y, incY ) S, D, C, Z
_axpy ( n, alpha, x, incX, y, incY ) S, D, C, Z
_dot ( n, x, incX, y, incY ) S, D, Ds
_dotu ( n, x, incX, y, incY ) C, Z
_dotc ( n, x, incX, y, incY ) C, Z
__dot ( n, alpha, x, incX, y, incY ) Sds
_nrm2 ( n, x, incX ) S, D, Sc, Dz
_asum ( n, x, incX ) S, D, Sc, Dz
I_amax( n, x, incX ) s, d, c, z
Level 2 BLAS
options dim b-width scalar matrix vector scalar vector prefixes
_gemv ( trans, m, n, alpha, a, lda, x, incX, beta, y, incY ) S, D, C, Z
_gbmv ( trans, m, n, kL, kU, alpha, a, lda, x, incX, beta, y, incY ) S, D, C, Z
_hemv ( uplo, n, alpha, a, lda, x, incX, beta, y, incY ) C, Z
_hbmv ( uplo, n, k, alpha, a, lda, x, incX, beta, y, incY ) C, Z
_hpmv ( uplo, n, alpha, ap, x, incX, beta, y, incY ) C, Z
_symv ( uplo, n, alpha, a, lda, x, incX, beta, y, incY ) S, D
_sbmv ( uplo, n, k, alpha, a, lda, x, incX, beta, y, incY ) S, D
_spmv ( uplo, n, alpha, ap, x, incX, beta, y, incY ) S, D
_trmv ( uplo, trans, diag, n, a, lda, x, incX ) S, D, C, Z
_tbmv ( uplo, trans, diag, n, k, a, lda, x, incX ) S, D, C, Z
_tpmv ( uplo, trans, diag, n, ap, x, incX ) S, D, C, Z
_trsv ( uplo, trans, diag, n, a, lda, x, incX ) S, D, C, Z
_tbsv ( uplo, trans, diag, n, k, a, lda, x, incX ) S, D, C, Z
_tpsv ( uplo, trans, diag, n, ap, x, incX ) S, D, C, Z
options dim scalar vector vector matrix prefixes
_ger ( m, n, alpha, x, incX, y, incY, a, lda ) S, D
_geru ( m, n, alpha, x, incX, y, incY, a, lda ) C, Z
_gerc ( m, n, alpha, x, incX, y, incY, a, lda ) C, Z
_her ( uplo, n, alpha, x, incX, a, lda ) C, Z
_hpr ( uplo, n, alpha, x, incX, ap ) C, Z
_her2 ( uplo, n, alpha, x, incX, y, incY, a, lda ) C, Z
_hpr2 ( uplo, n, alpha, x, incX, y, incY, ap ) C, Z
_syr ( uplo, n, alpha, x, incX, a, lda ) S, D
_spr ( uplo, n, alpha, x, incX, ap ) S, D
_syr2 ( uplo, n, alpha, x, incX, y, incY, a, lda ) S, D
_spr2 ( uplo, n, alpha, x, incX, y, incY, ap ) S, D
Level 3 BLAS
options dim scalar matrix matrix scalar matrix prefixes
_gemm ( transA, transB, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc ) S, D, C, Z
_symm ( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc ) S, D, C, Z
_hemm ( side, uplo, m, n, alpha, a, lda, b, ldb, beta, c, ldc ) C, Z
_syrk ( uplo, trans, n, k, alpha, a, lda, beta, c, ldc ) S, D, C, Z
_herk ( uplo, trans, n, k, alpha, a, lda, beta, c, ldc ) C, Z
_syr2k( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc ) S, D, C, Z
_her2k( uplo, trans, n, k, alpha, a, lda, b, ldb, beta, c, ldc ) C, Z
_trmm ( side, uplo, transA, diag, m, n, alpha, a, lda, b, ldb ) S, D, C, Z
_trsm ( side, uplo, transA, diag, m, n, alpha, a, lda, b, ldb ) S, D, C, Z
Meaning of prefixes
S - float32 C - complex64
D - float64 Z - complex128
Matrix types
GE - GEneral GB - General Band
SY - SYmmetric SB - Symmetric Band SP - Symmetric Packed
HE - HErmitian HB - Hermitian Band HP - Hermitian Packed
TR - TRiangular TB - Triangular Band TP - Triangular Packed
Options
trans = NoTrans, Trans, ConjTrans
uplo = Upper, Lower
diag = Nonunit, Unit
side = Left, Right (A or op(A) on the left, or A or op(A) on the right)
For real matrices, Trans and ConjTrans have the same meaning.
For Hermitian matrices, trans = Trans is not allowed.
For complex symmetric matrices, trans = ConjTrans is not allowed.
*/
package blas // import "gonum.org/v1/gonum/blas"

297
vendor/gonum.org/v1/gonum/blas/gonum/dgemm.go generated vendored Normal file
View File

@@ -0,0 +1,297 @@
// Copyright ©2014 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package gonum
import (
"runtime"
"sync"
"gonum.org/v1/gonum/blas"
"gonum.org/v1/gonum/internal/asm/f64"
)
// Dgemm computes one of the general matrix-matrix products
//
//	C = alpha * A * B + beta * C
//	C = alpha * Aᵀ * B + beta * C
//	C = alpha * A * Bᵀ + beta * C
//	C = alpha * Aᵀ * Bᵀ + beta * C
//
// where C is m×n, A is m×k (k×m when transposed), B is k×n (n×k when
// transposed) and alpha and beta are scalars. tA and tB select whether A and B
// are used transposed; for real data blas.Trans and blas.ConjTrans are treated
// alike.
//
// Dgemm panics if a transpose flag is not recognised, if a dimension is
// negative, if a leading dimension is smaller than the stored row length, or
// if a slice is too short for the matrix it is meant to hold.
func (Implementation) Dgemm(tA, tB blas.Transpose, m, n, k int, alpha float64, a []float64, lda int, b []float64, ldb int, beta float64, c []float64, ldc int) {
	if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
		panic(badTranspose)
	}
	if tB != blas.NoTrans && tB != blas.Trans && tB != blas.ConjTrans {
		panic(badTranspose)
	}
	switch {
	case m < 0:
		panic(mLT0)
	case n < 0:
		panic(nLT0)
	case k < 0:
		panic(kLT0)
	}

	// Only the three valid flags remain, so "not NoTrans" means transposed.
	aTrans := tA != blas.NoTrans
	bTrans := tB != blas.NoTrans

	// Shapes of A and B as they are laid out in memory (row-major).
	rowsA, colsA := m, k
	if aTrans {
		rowsA, colsA = k, m
	}
	rowsB, colsB := k, n
	if bTrans {
		rowsB, colsB = n, k
	}

	switch {
	case lda < max(1, colsA):
		panic(badLdA)
	case ldb < max(1, colsB):
		panic(badLdB)
	case ldc < max(1, n):
		panic(badLdC)
	}

	// Nothing to do for an empty C.
	if m == 0 || n == 0 {
		return
	}

	// Slice length checks; these are only meaningful for a non-empty C.
	switch {
	case len(a) < (rowsA-1)*lda+colsA:
		panic(shortA)
	case len(b) < (rowsB-1)*ldb+colsB:
		panic(shortB)
	case len(c) < (m-1)*ldc+n:
		panic(shortC)
	}

	// C is left untouched when the product term vanishes and beta is one.
	if beta == 1 && (alpha == 0 || k == 0) {
		return
	}

	// Apply beta to C before accumulating the product.
	switch beta {
	case 1:
		// C is already scaled.
	case 0:
		// Assign zeros explicitly so that NaN or Inf values already in C
		// are overwritten rather than multiplied.
		for i := 0; i < m; i++ {
			row := c[i*ldc : i*ldc+n]
			for j := range row {
				row[j] = 0
			}
		}
	default:
		for i := 0; i < m; i++ {
			row := c[i*ldc : i*ldc+n]
			for j := range row {
				row[j] *= beta
			}
		}
	}

	dgemmParallel(aTrans, bTrans, m, n, k, a, lda, b, ldb, c, ldc, alpha)
}
// dgemmParallel accumulates alpha * op(A) * op(B) into c, where op is selected
// by aTrans and bTrans. Small problems are forwarded to dgemmSerial; larger
// ones are split into blockSize×blockSize tiles of c that are processed by
// concurrently running goroutines.
func dgemmParallel(aTrans, bTrans bool, m, n, k int, a []float64, lda int, b []float64, ldb int, c []float64, ldc int, alpha float64) {
// dgemmParallel computes a parallel matrix multiplication by partitioning
// a and b into sub-blocks, and updating c with the multiplication of the
// sub-blocks.
// In all cases,
// A = [ A_11 A_12 ... A_1j
// A_21 A_22 ... A_2j
// ...
// A_i1 A_i2 ... A_ij]
//
// and same for B. All of the submatrix sizes are blockSize×blockSize except
// at the edges.
//
// In all cases, there is one dimension for each matrix along which
// C must be updated sequentially.
// Cij = \sum_k Aik Bkj, (A * B)
// Cij = \sum_k Aki Bkj, (Aᵀ * B)
// Cij = \sum_k Aik Bjk, (A * Bᵀ)
// Cij = \sum_k Aki Bjk, (Aᵀ * Bᵀ)
//
// This code computes one {i, j} block sequentially along the k dimension,
// and computes all of the {i, j} blocks concurrently. This
// partitioning allows Cij to be updated in-place without race-conditions.
// Instead of launching all goroutines at once, a buffered channel is used
// as a semaphore so that only a bounded number of them run at any time.
//
// http://alexkr.com/docs/matrixmult.pdf is a good reference on matrix-matrix
// multiplies, though this code does not copy matrices to attempt to eliminate
// cache misses.
// maxKLen keeps the full inner dimension; k is shadowed by the block loop
// variable inside the goroutine below.
maxKLen := k
parBlocks := blocks(m, blockSize) * blocks(n, blockSize)
if parBlocks < minParBlock {
// The matrix multiplication is small in the dimensions where it can be
// computed concurrently. Just do it in serial.
dgemmSerial(aTrans, bTrans, m, n, k, a, lda, b, ldb, c, ldc, alpha)
return
}
// workerLimit acts as a limit on the number of concurrent workers,
// with the limit set to the number of procs available.
workerLimit := make(chan struct{}, runtime.GOMAXPROCS(0))
// wg is used to wait for all of the block computations to finish
// before returning.
var wg sync.WaitGroup
wg.Add(parBlocks)
defer wg.Wait()
for i := 0; i < m; i += blockSize {
for j := 0; j < n; j += blockSize {
// Blocks here until a worker slot is free.
workerLimit <- struct{}{}
go func(i, j int) {
defer func() {
wg.Done()
// Release the worker slot.
<-workerLimit
}()
// Clip the block to the matrix edge.
leni := blockSize
if i+leni > m {
leni = m - i
}
lenj := blockSize
if j+lenj > n {
lenj = n - j
}
cSub := sliceView64(c, ldc, i, j, leni, lenj)
// Compute A_ik B_kj for all k
for k := 0; k < maxKLen; k += blockSize {
lenk := blockSize
if k+lenk > maxKLen {
lenk = maxKLen - k
}
var aSub, bSub []float64
// The block's row/column offsets swap when the operand is
// stored transposed.
if aTrans {
aSub = sliceView64(a, lda, k, i, lenk, leni)
} else {
aSub = sliceView64(a, lda, i, k, leni, lenk)
}
if bTrans {
bSub = sliceView64(b, ldb, j, k, lenj, lenk)
} else {
bSub = sliceView64(b, ldb, k, j, lenk, lenj)
}
dgemmSerial(aTrans, bTrans, leni, lenj, lenk, aSub, lda, bSub, ldb, cSub, ldc, alpha)
}
}(i, j)
}
}
}
// dgemmSerial is serial matrix multiply
func dgemmSerial(aTrans, bTrans bool, m, n, k int, a []float64, lda int, b []float64, ldb int, c []float64, ldc int, alpha float64) {
switch {
case !aTrans && !bTrans:
dgemmSerialNotNot(m, n, k, a, lda, b, ldb, c, ldc, alpha)
return
case aTrans && !bTrans:
dgemmSerialTransNot(m, n, k, a, lda, b, ldb, c, ldc, alpha)
return
case !aTrans && bTrans:
dgemmSerialNotTrans(m, n, k, a, lda, b, ldb, c, ldc, alpha)
return
case aTrans && bTrans:
dgemmSerialTransTrans(m, n, k, a, lda, b, ldb, c, ldc, alpha)
return
default:
panic("unreachable")
}
}
// dgemmSerialNotNot accumulates alpha * A * B into c, where neither a nor b
// is transposed: a is m×k, b is k×n and c is m×n.
func dgemmSerialNotNot(m, n, k int, a []float64, lda int, b []float64, ldb int, c []float64, ldc int, alpha float64) {
	// Rows are sliced once and then ranged over rather than indexing with
	// [i*stride+j] for every element; the original authors measured this as
	// roughly 5 times faster as of go 1.3.
	for i := 0; i < m; i++ {
		rowC := c[i*ldc : i*ldc+n]
		rowA := a[i*lda : i*lda+k]
		for l, av := range rowA {
			scaled := alpha * av
			if scaled == 0 {
				// A zero coefficient contributes nothing to this row of c.
				continue
			}
			f64.AxpyUnitary(scaled, b[l*ldb:l*ldb+n], rowC)
		}
	}
}
// dgemmSerialTransNot accumulates alpha * Aᵀ * B into c, where a is transposed
// and b is not: a is stored as k×m, b is k×n and c is m×n.
func dgemmSerialTransNot(m, n, k int, a []float64, lda int, b []float64, ldb int, c []float64, ldc int, alpha float64) {
	// Rows are sliced once and then ranged over rather than indexing with
	// [i*stride+j] for every element; the original authors measured this as
	// roughly 5 times faster as of go 1.3.
	for l := 0; l < k; l++ {
		rowB := b[l*ldb : l*ldb+n]
		rowA := a[l*lda : l*lda+m]
		for i, av := range rowA {
			scaled := alpha * av
			if scaled == 0 {
				// A zero coefficient contributes nothing to row i of c.
				continue
			}
			f64.AxpyUnitary(scaled, rowB, c[i*ldc:i*ldc+n])
		}
	}
}
// dgemmSerialNotTrans accumulates alpha * A * Bᵀ into c, where a is not
// transposed and b is: a is m×k, b is stored as n×k and c is m×n.
func dgemmSerialNotTrans(m, n, k int, a []float64, lda int, b []float64, ldb int, c []float64, ldc int, alpha float64) {
	// Rows are sliced once rather than indexing with [i*stride+j] for every
	// element; the original authors measured this as roughly 5 times faster
	// as of go 1.3.
	for i := 0; i < m; i++ {
		rowA := a[i*lda : i*lda+k]
		rowC := c[i*ldc : i*ldc+n]
		for j := range rowC {
			// Element (i, j) is the dot product of row i of a and row j of b.
			rowC[j] += alpha * f64.DotUnitary(rowA, b[j*ldb:j*ldb+k])
		}
	}
}
// dgemmSerialTransTrans accumulates alpha * Aᵀ * Bᵀ into c, where both a and
// b are transposed: a is stored as k×m, b is stored as n×k and c is m×n.
func dgemmSerialTransTrans(m, n, k int, a []float64, lda int, b []float64, ldb int, c []float64, ldc int, alpha float64) {
	// Rows of a are sliced once and then ranged over rather than indexing
	// with [i*stride+j] for every element; the original authors measured this
	// as roughly 5 times faster as of go 1.3.
	for l := 0; l < k; l++ {
		rowA := a[l*lda : l*lda+m]
		for i, av := range rowA {
			scaled := alpha * av
			if scaled == 0 {
				continue
			}
			// Column l of b is read with stride ldb starting at b[l].
			rowC := c[i*ldc : i*ldc+n]
			f64.AxpyInc(scaled, b[l:], rowC, uintptr(n), uintptr(ldb), 1, 0, 0)
		}
	}
}
// sliceView64 returns the part of a that backs the r×c sub-matrix whose
// top-left element is at row i, column j of a row-major matrix with stride
// lda. The result starts at element (i, j) and ends just past element
// (i+r-1, j+c-1); it shares memory with a.
func sliceView64(a []float64, lda, i, j, r, c int) []float64 {
	first := i*lda + j
	end := first + (r-1)*lda + c
	return a[first:end]
}

99
vendor/gonum.org/v1/gonum/blas/gonum/doc.go generated vendored Normal file
View File

@@ -0,0 +1,99 @@
// Copyright ©2015 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Ensure changes made to blas/native are reflected in blas/cgo where relevant.
/*
Package gonum is a Go implementation of the BLAS API. This implementation
panics when the input arguments are invalid as per the standard, for example
if a vector increment is zero. Note that the treatment of NaN values
is not specified, and differs among the BLAS implementations.
gonum.org/v1/gonum/blas/blas64 provides helpful wrapper functions to the BLAS
interface. The rest of this text describes the layout of the data for the input types.
Note that in the function documentation, x[i] refers to the i^th element
of the vector, which will be different from the i^th element of the slice if
incX != 1.
See http://www.netlib.org/lapack/explore-html/d4/de1/_l_i_c_e_n_s_e_source.html
for more license information.
Vector arguments are effectively strided slices. They have two input arguments,
a number of elements, n, and an increment, incX. The increment specifies the
distance between elements of the vector. The actual Go slice may be longer
than necessary.
The increment may be positive or negative, except in functions with only
a single vector argument where the increment may only be positive. If the increment
is negative, s[0] is the last element in the slice. Note that this is not the same
as counting backward from the end of the slice, as len(s) may be longer than
necessary. So, for example, if n = 5 and incX = 3, the elements of s are
[0 * * 1 * * 2 * * 3 * * 4 * * * ...]
where * elements are never accessed. If incX = -3, the same elements are
accessed, just in reverse order (4, 3, 2, 1, 0).
Dense matrices are specified by a number of rows, a number of columns, and a stride.
The stride specifies the number of entries in the slice between the first element
of successive rows. The stride must be at least as large as the number of columns
but may be longer.
[a00 ... a0n a0* ... a1stride-1 a21 ... amn am* ... amstride-1]
Thus, dense[i*ld + j] refers to the {i, j}th element of the matrix.
Symmetric and triangular matrices (non-packed) are stored identically to Dense,
except that only elements in one triangle of the matrix are accessed.
Packed symmetric and packed triangular matrices are laid out with the entries
condensed such that all of the unreferenced elements are removed. So, the upper triangular
matrix
[
1 2 3
0 4 5
0 0 6
]
and the lower-triangular matrix
[
1 0 0
2 3 0
4 5 6
]
will both be compacted as [1 2 3 4 5 6]. The (i, j) element of the original
dense matrix can be found at element i*n - (i-1)*i/2 + j - i for upper
triangular (for example, element (1, 1) above is at index 3), and at element
i*(i+1)/2 + j for lower triangular.
Banded matrices are laid out in a compact format, constructed by removing the
zeros in the rows and aligning the diagonals. For example, the matrix
[
1 2 3 0 0 0
4 5 6 7 0 0
0 8 9 10 11 0
0 0 12 13 14 15
0 0 0 16 17 18
0 0 0 0 19 20
]
implicitly becomes (* entries are never accessed)
[
* 1 2 3
4 5 6 7
8 9 10 11
12 13 14 15
16 17 18 *
19 20 * *
]
which is given to the BLAS routine as [ 1 2 3 4 ...].
See http://www.crest.iu.edu/research/mtl/reference/html/banded.html
for more information
*/
package gonum // import "gonum.org/v1/gonum/blas/gonum"

35
vendor/gonum.org/v1/gonum/blas/gonum/errors.go generated vendored Normal file
View File

@@ -0,0 +1,35 @@
// Copyright ©2015 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package gonum
// Panic strings used during parameter checks.
// Every message carries the "blas: " prefix.
// This list is duplicated in netlib/blas/netlib. Keep in sync.
const (
// Vector increments.
zeroIncX = "blas: zero x index increment"
zeroIncY = "blas: zero y index increment"
// Negative dimensions and band widths.
mLT0 = "blas: m < 0"
nLT0 = "blas: n < 0"
kLT0 = "blas: k < 0"
kLLT0 = "blas: kL < 0"
kULT0 = "blas: kU < 0"
// Unrecognised option flags.
badUplo = "blas: illegal triangle"
badTranspose = "blas: illegal transpose"
badDiag = "blas: illegal diagonal"
badSide = "blas: illegal side"
badFlag = "blas: illegal rotm flag"
// Leading dimensions.
badLdA = "blas: bad leading dimension of A"
badLdB = "blas: bad leading dimension of B"
badLdC = "blas: bad leading dimension of C"
// Slices too short for the requested operation.
shortX = "blas: insufficient length of x"
shortY = "blas: insufficient length of y"
shortAP = "blas: insufficient length of ap"
shortA = "blas: insufficient length of a"
shortB = "blas: insufficient length of b"
shortC = "blas: insufficient length of c"
)

52
vendor/gonum.org/v1/gonum/blas/gonum/gonum.go generated vendored Normal file
View File

@@ -0,0 +1,52 @@
// Copyright ©2015 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:generate ./single_precision.bash
package gonum
import (
"math"
"gonum.org/v1/gonum/internal/math32"
)
// Implementation is the empty struct type on which the BLAS routines of this
// package are defined as methods.
type Implementation struct{}
// [SD]gemm behavior constants. These are kept here to keep them out of the
// way during single precision code generation.
//
// blockSize is the edge length of the square tiles the gemm routines split
// their operands into; minParBlock is the number of output tiles below which
// the multiplication is done serially.
const (
blockSize = 64 // b x b matrix
minParBlock = 4 // minimum number of blocks needed to go parallel
)
// max returns the larger of a and b.
func max(a, b int) int {
	if b >= a {
		return b
	}
	return a
}
// min returns the smaller of a and b.
func min(a, b int) int {
	if a <= b {
		return a
	}
	return b
}
// blocks reports how many blocks of size bsize are needed to cover a
// dimension of length dim, counting a final partial block as one.
func blocks(dim, bsize int) int {
	padded := dim + (bsize - 1)
	return padded / bsize
}
// dcabs1 returns |real(z)|+|imag(z)|.
func dcabs1(z complex128) float64 {
return math.Abs(real(z)) + math.Abs(imag(z))
}
// scabs1 returns |real(z)|+|imag(z)|, the sum of the absolute values of the
// real and imaginary parts of z, in single precision.
func scabs1(z complex64) float32 {
	re := math32.Abs(real(z))
	im := math32.Abs(imag(z))
	return re + im
}

454
vendor/gonum.org/v1/gonum/blas/gonum/level1cmplx128.go generated vendored Normal file
View File

@@ -0,0 +1,454 @@
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package gonum
import (
"math"
"gonum.org/v1/gonum/blas"
"gonum.org/v1/gonum/internal/asm/c128"
)
// Compile-time check that Implementation satisfies blas.Complex128Level1.
var _ blas.Complex128Level1 = Implementation{}
// Dzasum computes
//
//	\sum_i |Re(x[i])| + |Im(x[i])|
//
// over the n elements of x taken with stride incX.
//
// Dzasum returns 0 if incX is negative. It panics if n is negative, if incX
// is zero, or if x is too short.
func (Implementation) Dzasum(n int, x []complex128, incX int) float64 {
	if n < 0 {
		panic(nLT0)
	}
	if incX == 0 {
		panic(zeroIncX)
	}
	if incX < 0 {
		return 0
	}

	var total float64
	if incX == 1 {
		if len(x) < n {
			panic(shortX)
		}
		for i := 0; i < n; i++ {
			total += dcabs1(x[i])
		}
		return total
	}

	if len(x) <= (n-1)*incX {
		panic(shortX)
	}
	for i, ix := 0, 0; i < n; i, ix = i+1, ix+incX {
		total += dcabs1(x[ix])
	}
	return total
}
// Dznrm2 returns the Euclidean norm of the complex vector x,
//
//	‖x‖_2 = sqrt(\sum_i x[i] * conj(x[i])).
//
// The sum is kept as a (scale, ssq) pair with the invariant
// scale² * ssq = sum of squares processed so far, which avoids squaring the
// raw components.
//
// Dznrm2 returns 0 if incX is negative or n is zero. It panics if incX is
// zero, if n is negative, or if x is too short.
func (Implementation) Dznrm2(n int, x []complex128, incX int) float64 {
	if incX == 0 {
		panic(zeroIncX)
	}
	if incX < 0 {
		return 0
	}
	if n < 0 {
		panic(nLT0)
	}
	if n == 0 {
		return 0
	}
	if len(x) <= (n-1)*incX {
		panic(shortX)
	}

	var scale float64
	ssq := 1.0

	// fold adds the square of one non-negative component to the running
	// (scale, ssq) pair, rescaling when a new largest magnitude is seen.
	fold := func(abs float64) {
		if abs == 0 {
			return
		}
		if abs > scale {
			ssq = 1 + ssq*(scale/abs)*(scale/abs)
			scale = abs
			return
		}
		ssq += (abs / scale) * (abs / scale)
	}

	if incX == 1 {
		for _, v := range x[:n] {
			re, im := math.Abs(real(v)), math.Abs(imag(v))
			fold(re)
			fold(im)
		}
	} else {
		for ix := 0; ix < n*incX; ix += incX {
			re, im := math.Abs(real(x[ix])), math.Abs(imag(x[ix]))
			fold(re)
			fold(im)
		}
	}

	// An infinite scale is returned directly instead of being multiplied
	// by sqrt(ssq).
	if math.IsInf(scale, 1) {
		return math.Inf(1)
	}
	return scale * math.Sqrt(ssq)
}
// Izamax returns the index of the first element of x, among the n elements
// taken with stride incX, that has the largest |Re(·)|+|Im(·)|.
//
// Izamax returns -1 if n is 0 or incX is negative. It panics if incX is zero,
// if n is negative, or if x is too short.
func (Implementation) Izamax(n int, x []complex128, incX int) int {
	if incX == 0 {
		panic(zeroIncX)
	}
	if incX < 0 {
		// Return invalid index.
		return -1
	}
	if n < 0 {
		panic(nLT0)
	}
	if n == 0 {
		// Return invalid index.
		return -1
	}
	if (n-1)*incX >= len(x) {
		panic(shortX)
	}

	best, bestAbs := 0, dcabs1(x[0])
	if incX == 1 {
		for i := 1; i < n; i++ {
			// Strict comparison keeps the earliest of equal maxima.
			if cur := dcabs1(x[i]); cur > bestAbs {
				best, bestAbs = i, cur
			}
		}
		return best
	}
	for i, ix := 1, incX; i < n; i, ix = i+1, ix+incX {
		if cur := dcabs1(x[ix]); cur > bestAbs {
			best, bestAbs = i, cur
		}
	}
	return best
}
// Zaxpy adds a scaled copy of x to y:
//
//	y[i] += alpha * x[i] for all i
//
// Zaxpy does nothing if n is zero or alpha is zero. It panics if either
// increment is zero, if n is negative, or if x or y is too short.
func (Implementation) Zaxpy(n int, alpha complex128, x []complex128, incX int, y []complex128, incY int) {
	switch {
	case incX == 0:
		panic(zeroIncX)
	case incY == 0:
		panic(zeroIncY)
	case n < 0:
		panic(nLT0)
	case n == 0:
		return
	}

	// Largest slice index touched in each vector, whatever the direction.
	spanX := (n - 1) * incX
	if incX < 0 {
		spanX = -spanX
	}
	if spanX >= len(x) {
		panic(shortX)
	}
	spanY := (n - 1) * incY
	if incY < 0 {
		spanY = -spanY
	}
	if spanY >= len(y) {
		panic(shortY)
	}

	if alpha == 0 {
		return
	}
	if incX == 1 && incY == 1 {
		c128.AxpyUnitary(alpha, x[:n], y[:n])
		return
	}

	// A negative increment walks its vector backwards from the far end.
	startX, startY := 0, 0
	if incX < 0 {
		startX = (1 - n) * incX
	}
	if incY < 0 {
		startY = (1 - n) * incY
	}
	c128.AxpyInc(alpha, x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(startX), uintptr(startY))
}
// Zcopy copies the n elements of x, taken with stride incX, into y with
// stride incY.
//
// Zcopy does nothing if n is zero. It panics if either increment is zero, if
// n is negative, or if x or y is too short.
func (Implementation) Zcopy(n int, x []complex128, incX int, y []complex128, incY int) {
	switch {
	case incX == 0:
		panic(zeroIncX)
	case incY == 0:
		panic(zeroIncY)
	case n < 0:
		panic(nLT0)
	case n == 0:
		return
	}

	// Largest slice index touched in each vector, whatever the direction.
	spanX := (n - 1) * incX
	if incX < 0 {
		spanX = -spanX
	}
	if spanX >= len(x) {
		panic(shortX)
	}
	spanY := (n - 1) * incY
	if incY < 0 {
		spanY = -spanY
	}
	if spanY >= len(y) {
		panic(shortY)
	}

	if incX == 1 && incY == 1 {
		copy(y[:n], x[:n])
		return
	}

	// A negative increment walks its vector backwards from the far end.
	posX, posY := 0, 0
	if incX < 0 {
		posX = (1 - n) * incX
	}
	if incY < 0 {
		posY = (1 - n) * incY
	}
	for remaining := n; remaining > 0; remaining-- {
		y[posY] = x[posX]
		posX += incX
		posY += incY
	}
}
// Zdotc returns the conjugated dot product
//
//	xᴴ · y
//
// of the complex vectors x and y.
//
// Zdotc returns 0 if n is zero. It panics if either increment is zero, if n
// is negative, or if x or y is too short.
func (Implementation) Zdotc(n int, x []complex128, incX int, y []complex128, incY int) complex128 {
	switch {
	case incX == 0:
		panic(zeroIncX)
	case incY == 0:
		panic(zeroIncY)
	case n < 0:
		panic(nLT0)
	case n == 0:
		return 0
	}

	if incX == 1 && incY == 1 {
		switch {
		case len(x) < n:
			panic(shortX)
		case len(y) < n:
			panic(shortY)
		}
		return c128.DotcUnitary(x[:n], y[:n])
	}

	// A negative increment walks its vector backwards from the far end.
	startX, startY := 0, 0
	if incX < 0 {
		startX = (1 - n) * incX
	}
	if incY < 0 {
		startY = (1 - n) * incY
	}
	if len(x) <= startX || len(x) <= (n-1)*incX {
		panic(shortX)
	}
	if len(y) <= startY || len(y) <= (n-1)*incY {
		panic(shortY)
	}
	return c128.DotcInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(startX), uintptr(startY))
}
// Zdotu returns the unconjugated dot product
//
//	xᵀ · y
//
// of the complex vectors x and y.
//
// Zdotu returns 0 if n is zero. It panics if either increment is zero, if n
// is negative, or if x or y is too short.
func (Implementation) Zdotu(n int, x []complex128, incX int, y []complex128, incY int) complex128 {
	switch {
	case incX == 0:
		panic(zeroIncX)
	case incY == 0:
		panic(zeroIncY)
	case n < 0:
		panic(nLT0)
	case n == 0:
		return 0
	}

	if incX == 1 && incY == 1 {
		switch {
		case len(x) < n:
			panic(shortX)
		case len(y) < n:
			panic(shortY)
		}
		return c128.DotuUnitary(x[:n], y[:n])
	}

	// A negative increment walks its vector backwards from the far end.
	startX, startY := 0, 0
	if incX < 0 {
		startX = (1 - n) * incX
	}
	if incY < 0 {
		startY = (1 - n) * incY
	}
	if len(x) <= startX || len(x) <= (n-1)*incX {
		panic(shortX)
	}
	if len(y) <= startY || len(y) <= (n-1)*incY {
		panic(shortY)
	}
	return c128.DotuInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(startX), uintptr(startY))
}
// Zdscal multiplies the n elements of x, taken with stride incX, by the real
// scalar alpha. When alpha is zero the elements are set to zero directly.
//
// Zdscal has no effect if incX < 0 or n is zero. It panics if incX is zero,
// if x is too short, or if n is negative.
func (Implementation) Zdscal(n int, alpha float64, x []complex128, incX int) {
	if incX == 0 {
		panic(zeroIncX)
	}
	if incX < 0 {
		return
	}
	if len(x) <= (n-1)*incX {
		panic(shortX)
	}
	if n < 0 {
		panic(nLT0)
	}
	if n == 0 {
		return
	}

	if incX == 1 {
		vec := x[:n]
		if alpha == 0 {
			for i := range vec {
				vec[i] = 0
			}
			return
		}
		for i, v := range vec {
			vec[i] = complex(alpha*real(v), alpha*imag(v))
		}
		return
	}

	end := n * incX
	if alpha == 0 {
		for ix := 0; ix < end; ix += incX {
			x[ix] = 0
		}
		return
	}
	for ix := 0; ix < end; ix += incX {
		v := x[ix]
		x[ix] = complex(alpha*real(v), alpha*imag(v))
	}
}
// Zscal multiplies the n elements of x, taken with stride incX, by the
// complex scalar alpha. When alpha is zero the elements are set to zero
// directly.
//
// Zscal has no effect if incX < 0 or n is zero. It panics if incX is zero, if
// x is too short, or if n is negative.
func (Implementation) Zscal(n int, alpha complex128, x []complex128, incX int) {
	if incX == 0 {
		panic(zeroIncX)
	}
	if incX < 0 {
		return
	}
	if len(x) <= (n-1)*incX {
		panic(shortX)
	}
	if n < 0 {
		panic(nLT0)
	}
	if n == 0 {
		return
	}

	if incX == 1 {
		if alpha == 0 {
			vec := x[:n]
			for i := range vec {
				vec[i] = 0
			}
			return
		}
		c128.ScalUnitary(alpha, x[:n])
		return
	}

	if alpha == 0 {
		for ix, end := 0, n*incX; ix < end; ix += incX {
			x[ix] = 0
		}
		return
	}
	c128.ScalInc(alpha, x, uintptr(n), uintptr(incX))
}
// Zswap exchanges the n elements of x, taken with stride incX, with the n
// elements of y, taken with stride incY.
//
// Zswap does nothing if n is zero. It panics if either increment is zero, if
// n is negative, or if x or y is too short.
func (Implementation) Zswap(n int, x []complex128, incX int, y []complex128, incY int) {
	switch {
	case incX == 0:
		panic(zeroIncX)
	case incY == 0:
		panic(zeroIncY)
	case n < 0:
		panic(nLT0)
	case n == 0:
		return
	}

	// Largest slice index touched in each vector, whatever the direction.
	spanX := (n - 1) * incX
	if incX < 0 {
		spanX = -spanX
	}
	if spanX >= len(x) {
		panic(shortX)
	}
	spanY := (n - 1) * incY
	if incY < 0 {
		spanY = -spanY
	}
	if spanY >= len(y) {
		panic(shortY)
	}

	if incX == 1 && incY == 1 {
		head := x[:n]
		for i, xv := range head {
			head[i], y[i] = y[i], xv
		}
		return
	}

	// A negative increment walks its vector backwards from the far end.
	posX, posY := 0, 0
	if incX < 0 {
		posX = (1 - n) * incX
	}
	if incY < 0 {
		posY = (1 - n) * incY
	}
	for remaining := n; remaining > 0; remaining-- {
		x[posX], y[posY] = y[posY], x[posX]
		posX += incX
		posY += incY
	}
}

476
vendor/gonum.org/v1/gonum/blas/gonum/level1cmplx64.go generated vendored Normal file
View File

@@ -0,0 +1,476 @@
// Code generated by "go generate gonum.org/v1/gonum/blas/gonum"; DO NOT EDIT.
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package gonum
import (
math "gonum.org/v1/gonum/internal/math32"
"gonum.org/v1/gonum/blas"
"gonum.org/v1/gonum/internal/asm/c64"
)
// Compile-time check that Implementation satisfies blas.Complex64Level1.
var _ blas.Complex64Level1 = Implementation{}
// Scasum returns the sum of the absolute values of the elements of x
//
// \sum_i |Re(x[i])| + |Im(x[i])|
//
// Scasum returns 0 if incX is negative. It panics if n is negative, if incX
// is zero, or if x is too short.
//
// Complex64 implementations are autogenerated and not directly tested.
func (Implementation) Scasum(n int, x []complex64, incX int) float32 {
if n < 0 {
panic(nLT0)
}
if incX < 1 {
if incX == 0 {
panic(zeroIncX)
}
// Negative increment: nothing is summed.
return 0
}
var sum float32
if incX == 1 {
// Contiguous case.
if len(x) < n {
panic(shortX)
}
for _, v := range x[:n] {
sum += scabs1(v)
}
return sum
}
// Strided case: the last element read is x[(n-1)*incX].
if (n-1)*incX >= len(x) {
panic(shortX)
}
for i := 0; i < n; i++ {
v := x[i*incX]
sum += scabs1(v)
}
return sum
}
// Scnrm2 computes the Euclidean norm of the complex vector x,
//
// ‖x‖_2 = sqrt(\sum_i x[i] * conj(x[i])).
//
// This function returns 0 if incX is negative or n is zero. It panics if incX
// is zero, if n is negative, or if x is too short.
//
// Complex64 implementations are autogenerated and not directly tested.
func (Implementation) Scnrm2(n int, x []complex64, incX int) float32 {
if incX < 1 {
if incX == 0 {
panic(zeroIncX)
}
return 0
}
if n < 1 {
if n == 0 {
return 0
}
panic(nLT0)
}
if (n-1)*incX >= len(x) {
panic(shortX)
}
// The sum of squares is kept as scale*scale*ssq, where scale is the largest
// component magnitude seen so far.
var (
scale float32
ssq float32 = 1
)
if incX == 1 {
// Contiguous case.
for _, v := range x[:n] {
re, im := math.Abs(real(v)), math.Abs(imag(v))
if re != 0 {
if re > scale {
// New largest magnitude: rescale the accumulated sum.
ssq = 1 + ssq*(scale/re)*(scale/re)
scale = re
} else {
ssq += (re / scale) * (re / scale)
}
}
if im != 0 {
if im > scale {
ssq = 1 + ssq*(scale/im)*(scale/im)
scale = im
} else {
ssq += (im / scale) * (im / scale)
}
}
}
if math.IsInf(scale, 1) {
return math.Inf(1)
}
return scale * math.Sqrt(ssq)
}
// Strided case: same accumulation as above over x[0], x[incX], ...
for ix := 0; ix < n*incX; ix += incX {
re, im := math.Abs(real(x[ix])), math.Abs(imag(x[ix]))
if re != 0 {
if re > scale {
ssq = 1 + ssq*(scale/re)*(scale/re)
scale = re
} else {
ssq += (re / scale) * (re / scale)
}
}
if im != 0 {
if im > scale {
ssq = 1 + ssq*(scale/im)*(scale/im)
scale = im
} else {
ssq += (im / scale) * (im / scale)
}
}
}
// An infinite scale is returned directly instead of being multiplied by
// sqrt(ssq).
if math.IsInf(scale, 1) {
return math.Inf(1)
}
return scale * math.Sqrt(ssq)
}
// Icamax returns the index of the first element of x having largest |Re(·)|+|Im(·)|.
// Icamax returns -1 if n is 0 or incX is negative. It panics if incX is zero,
// if n is negative, or if x is too short.
//
// Complex64 implementations are autogenerated and not directly tested.
func (Implementation) Icamax(n int, x []complex64, incX int) int {
if incX < 1 {
if incX == 0 {
panic(zeroIncX)
}
// Return invalid index.
return -1
}
if n < 1 {
if n == 0 {
// Return invalid index.
return -1
}
panic(nLT0)
}
if len(x) <= (n-1)*incX {
panic(shortX)
}
// Start with the first element as the current maximum.
idx := 0
max := scabs1(x[0])
if incX == 1 {
// The range starts at x[1], so i is offset by one from the element index.
for i, v := range x[1:n] {
absV := scabs1(v)
// Strict comparison keeps the earliest of equal maxima.
if absV > max {
max = absV
idx = i + 1
}
}
return idx
}
// Strided case: ix is the slice position of the i-th vector element.
ix := incX
for i := 1; i < n; i++ {
absV := scabs1(x[ix])
if absV > max {
max = absV
idx = i
}
ix += incX
}
return idx
}
// Caxpy adds alpha times x to y:
//
// y[i] += alpha * x[i] for all i
//
// Caxpy does nothing if n is zero or alpha is zero. It panics if either
// increment is zero, if n is negative, or if x or y is too short.
//
// Complex64 implementations are autogenerated and not directly tested.
func (Implementation) Caxpy(n int, alpha complex64, x []complex64, incX int, y []complex64, incY int) {
if incX == 0 {
panic(zeroIncX)
}
if incY == 0 {
panic(zeroIncY)
}
if n < 1 {
if n == 0 {
return
}
panic(nLT0)
}
// The largest slice index touched is |(n-1)*inc| for either direction.
if (incX > 0 && (n-1)*incX >= len(x)) || (incX < 0 && (1-n)*incX >= len(x)) {
panic(shortX)
}
if (incY > 0 && (n-1)*incY >= len(y)) || (incY < 0 && (1-n)*incY >= len(y)) {
panic(shortY)
}
if alpha == 0 {
return
}
if incX == 1 && incY == 1 {
c64.AxpyUnitary(alpha, x[:n], y[:n])
return
}
// A negative increment starts at the far end and walks backwards.
var ix, iy int
if incX < 0 {
ix = (1 - n) * incX
}
if incY < 0 {
iy = (1 - n) * incY
}
c64.AxpyInc(alpha, x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy))
}
// Ccopy copies the vector x to vector y.
//
// Ccopy does nothing if n is zero. It panics if either increment is zero, if
// n is negative, or if x or y is too short.
//
// Complex64 implementations are autogenerated and not directly tested.
func (Implementation) Ccopy(n int, x []complex64, incX int, y []complex64, incY int) {
if incX == 0 {
panic(zeroIncX)
}
if incY == 0 {
panic(zeroIncY)
}
if n < 1 {
if n == 0 {
return
}
panic(nLT0)
}
// The largest slice index touched is |(n-1)*inc| for either direction.
if (incX > 0 && (n-1)*incX >= len(x)) || (incX < 0 && (1-n)*incX >= len(x)) {
panic(shortX)
}
if (incY > 0 && (n-1)*incY >= len(y)) || (incY < 0 && (1-n)*incY >= len(y)) {
panic(shortY)
}
if incX == 1 && incY == 1 {
copy(y[:n], x[:n])
return
}
// A negative increment starts at the far end and walks backwards.
var ix, iy int
if incX < 0 {
ix = (-n + 1) * incX
}
if incY < 0 {
iy = (-n + 1) * incY
}
for i := 0; i < n; i++ {
y[iy] = x[ix]
ix += incX
iy += incY
}
}
// Cdotc computes the dot product
//
// xᴴ · y
//
// of two complex vectors x and y.
//
// Cdotc returns 0 if n is zero. It panics if either increment is zero, if n
// is negative, or if x or y is too short.
//
// Complex64 implementations are autogenerated and not directly tested.
func (Implementation) Cdotc(n int, x []complex64, incX int, y []complex64, incY int) complex64 {
if incX == 0 {
panic(zeroIncX)
}
if incY == 0 {
panic(zeroIncY)
}
if n <= 0 {
if n == 0 {
return 0
}
panic(nLT0)
}
if incX == 1 && incY == 1 {
// Contiguous case.
if len(x) < n {
panic(shortX)
}
if len(y) < n {
panic(shortY)
}
return c64.DotcUnitary(x[:n], y[:n])
}
// A negative increment starts at the far end and walks backwards.
var ix, iy int
if incX < 0 {
ix = (-n + 1) * incX
}
if incY < 0 {
iy = (-n + 1) * incY
}
// Both the starting index and the last forward index must lie inside the slice.
if ix >= len(x) || (n-1)*incX >= len(x) {
panic(shortX)
}
if iy >= len(y) || (n-1)*incY >= len(y) {
panic(shortY)
}
return c64.DotcInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy))
}
// Cdotu computes the dot product
//
// xᵀ · y
//
// of two complex vectors x and y.
//
// Cdotu returns 0 if n is zero. It panics if either increment is zero, if n
// is negative, or if x or y is too short.
//
// Complex64 implementations are autogenerated and not directly tested.
func (Implementation) Cdotu(n int, x []complex64, incX int, y []complex64, incY int) complex64 {
if incX == 0 {
panic(zeroIncX)
}
if incY == 0 {
panic(zeroIncY)
}
if n <= 0 {
if n == 0 {
return 0
}
panic(nLT0)
}
if incX == 1 && incY == 1 {
// Contiguous case.
if len(x) < n {
panic(shortX)
}
if len(y) < n {
panic(shortY)
}
return c64.DotuUnitary(x[:n], y[:n])
}
// A negative increment starts at the far end and walks backwards.
var ix, iy int
if incX < 0 {
ix = (-n + 1) * incX
}
if incY < 0 {
iy = (-n + 1) * incY
}
// Both the starting index and the last forward index must lie inside the slice.
if ix >= len(x) || (n-1)*incX >= len(x) {
panic(shortX)
}
if iy >= len(y) || (n-1)*incY >= len(y) {
panic(shortY)
}
return c64.DotuInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy))
}
// Csscal scales the vector x by a real scalar alpha.
// Csscal has no effect if incX < 0 or n is zero. It panics if incX is zero,
// if x is too short, or if n is negative.
//
// Complex64 implementations are autogenerated and not directly tested.
func (Implementation) Csscal(n int, alpha float32, x []complex64, incX int) {
if incX < 1 {
if incX == 0 {
panic(zeroIncX)
}
return
}
// For n <= 0 the left-hand side is negative, so this never fires and the
// n check below decides.
if (n-1)*incX >= len(x) {
panic(shortX)
}
if n < 1 {
if n == 0 {
return
}
panic(nLT0)
}
if alpha == 0 {
// Zeros are assigned directly rather than computed by multiplication.
if incX == 1 {
x = x[:n]
for i := range x {
x[i] = 0
}
return
}
for ix := 0; ix < n*incX; ix += incX {
x[ix] = 0
}
return
}
if incX == 1 {
x = x[:n]
for i, v := range x {
// Real and imaginary parts are scaled separately.
x[i] = complex(alpha*real(v), alpha*imag(v))
}
return
}
for ix := 0; ix < n*incX; ix += incX {
v := x[ix]
x[ix] = complex(alpha*real(v), alpha*imag(v))
}
}
// Cscal scales the vector x by a complex scalar alpha.
// Cscal has no effect if incX < 0 or n is zero. It panics if incX is zero, if
// x is too short, or if n is negative.
//
// Complex64 implementations are autogenerated and not directly tested.
func (Implementation) Cscal(n int, alpha complex64, x []complex64, incX int) {
if incX < 1 {
if incX == 0 {
panic(zeroIncX)
}
return
}
// For n <= 0 the left-hand side is negative, so this never fires and the
// n check below decides.
if (n-1)*incX >= len(x) {
panic(shortX)
}
if n < 1 {
if n == 0 {
return
}
panic(nLT0)
}
if alpha == 0 {
// Zeros are assigned directly rather than computed by multiplication.
if incX == 1 {
x = x[:n]
for i := range x {
x[i] = 0
}
return
}
for ix := 0; ix < n*incX; ix += incX {
x[ix] = 0
}
return
}
if incX == 1 {
c64.ScalUnitary(alpha, x[:n])
return
}
c64.ScalInc(alpha, x, uintptr(n), uintptr(incX))
}
// Cswap exchanges the elements of two complex vectors x and y.
//
// Cswap does nothing if n is zero. It panics if either increment is zero, if
// n is negative, or if x or y is too short.
//
// Complex64 implementations are autogenerated and not directly tested.
func (Implementation) Cswap(n int, x []complex64, incX int, y []complex64, incY int) {
if incX == 0 {
panic(zeroIncX)
}
if incY == 0 {
panic(zeroIncY)
}
if n < 1 {
if n == 0 {
return
}
panic(nLT0)
}
// The largest slice index touched is |(n-1)*inc| for either direction.
if (incX > 0 && (n-1)*incX >= len(x)) || (incX < 0 && (1-n)*incX >= len(x)) {
panic(shortX)
}
if (incY > 0 && (n-1)*incY >= len(y)) || (incY < 0 && (1-n)*incY >= len(y)) {
panic(shortY)
}
if incX == 1 && incY == 1 {
x = x[:n]
for i, v := range x {
x[i], y[i] = y[i], v
}
return
}
// A negative increment starts at the far end and walks backwards.
var ix, iy int
if incX < 0 {
ix = (-n + 1) * incX
}
if incY < 0 {
iy = (-n + 1) * incY
}
for i := 0; i < n; i++ {
x[ix], y[iy] = y[iy], x[ix]
ix += incX
iy += incY
}
}

653
vendor/gonum.org/v1/gonum/blas/gonum/level1float32.go generated vendored Normal file
View File

@@ -0,0 +1,653 @@
// Code generated by "go generate gonum.org/v1/gonum/blas/gonum"; DO NOT EDIT.
// Copyright ©2015 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package gonum
import (
math "gonum.org/v1/gonum/internal/math32"
"gonum.org/v1/gonum/blas"
"gonum.org/v1/gonum/internal/asm/f32"
)
// Compile-time check that Implementation satisfies blas.Float32Level1.
var _ blas.Float32Level1 = Implementation{}
// Snrm2 computes the Euclidean norm of a vector,
//
// sqrt(\sum_i x[i] * x[i]).
//
// This function returns 0 if incX is negative or n is zero. It panics if incX
// is zero, if x is too short, or if n is negative.
//
// Float32 implementations are autogenerated and not directly tested.
func (Implementation) Snrm2(n int, x []float32, incX int) float32 {
if incX < 1 {
if incX == 0 {
panic(zeroIncX)
}
return 0
}
// For n <= 0 the right-hand side is negative, so this never fires and the
// n checks below decide.
if len(x) <= (n-1)*incX {
panic(shortX)
}
if n < 2 {
if n == 1 {
// The norm of a single element is its absolute value.
return math.Abs(x[0])
}
if n == 0 {
return 0
}
panic(nLT0)
}
if incX == 1 {
return f32.L2NormUnitary(x[:n])
}
return f32.L2NormInc(x, uintptr(n), uintptr(incX))
}
// Sasum computes the sum of the absolute values of the elements of x.
//
// \sum_i |x[i]|
//
// Sasum returns 0 if incX is negative. It panics if n is negative, if incX is
// zero, or if x is too short.
//
// Float32 implementations are autogenerated and not directly tested.
func (Implementation) Sasum(n int, x []float32, incX int) float32 {
var sum float32
if n < 0 {
panic(nLT0)
}
if incX < 1 {
if incX == 0 {
panic(zeroIncX)
}
return 0
}
// The last element read is x[(n-1)*incX].
if len(x) <= (n-1)*incX {
panic(shortX)
}
if incX == 1 {
// Contiguous case.
x = x[:n]
for _, v := range x {
sum += math.Abs(v)
}
return sum
}
for i := 0; i < n; i++ {
sum += math.Abs(x[i*incX])
}
return sum
}
// Isamax returns the index of an element of x with the largest absolute value.
// If there are multiple such indices the earliest is returned.
// Isamax returns -1 if n == 0 or incX is negative. It panics if incX is zero,
// if x is too short, or if n is negative.
//
// Float32 implementations are autogenerated and not directly tested.
func (Implementation) Isamax(n int, x []float32, incX int) int {
if incX < 1 {
if incX == 0 {
panic(zeroIncX)
}
return -1
}
// For n <= 0 the right-hand side is negative, so this never fires and the
// n checks below decide.
if len(x) <= (n-1)*incX {
panic(shortX)
}
if n < 2 {
if n == 1 {
return 0
}
if n == 0 {
return -1 // Netlib returns invalid index when n == 0.
}
panic(nLT0)
}
// Start with the first element as the current maximum.
idx := 0
max := math.Abs(x[0])
if incX == 1 {
// Element 0 is visited again here; the strict comparison leaves idx unchanged.
for i, v := range x[:n] {
absV := math.Abs(v)
if absV > max {
max = absV
idx = i
}
}
return idx
}
// Strided case: ix is the slice position of the i-th vector element.
ix := incX
for i := 1; i < n; i++ {
v := x[ix]
absV := math.Abs(v)
if absV > max {
max = absV
idx = i
}
ix += incX
}
return idx
}
// Sswap exchanges the n elements of x and y visited with strides incX and
// incY.
//
//	x[i], y[i] = y[i], x[i] for all i
//
// Sswap panics if either increment is zero, if n is negative, or if x or y is
// too short for the requested n and increment. It does nothing when n == 0.
//
// Float32 implementations are autogenerated and not directly tested.
func (Implementation) Sswap(n int, x []float32, incX int, y []float32, incY int) {
	switch {
	case incX == 0:
		panic(zeroIncX)
	case incY == 0:
		panic(zeroIncY)
	case n < 0:
		panic(nLT0)
	case n == 0:
		return
	case incX > 0 && len(x) <= (n-1)*incX, incX < 0 && len(x) <= (1-n)*incX:
		panic(shortX)
	case incY > 0 && len(y) <= (n-1)*incY, incY < 0 && len(y) <= (1-n)*incY:
		panic(shortY)
	}

	if incX == 1 && incY == 1 {
		for i := 0; i < n; i++ {
			x[i], y[i] = y[i], x[i]
		}
		return
	}

	// With a negative increment the walk starts at the far end of the slice
	// and moves towards index 0.
	ix, iy := 0, 0
	if incX < 0 {
		ix = (1 - n) * incX
	}
	if incY < 0 {
		iy = (1 - n) * incY
	}
	for left := n; left > 0; left-- {
		x[ix], y[iy] = y[iy], x[ix]
		ix += incX
		iy += incY
	}
}
// Scopy copies n elements of x, visited with stride incX, into the elements
// of y visited with stride incY.
//
//	y[i] = x[i] for all i
//
// Scopy panics if either increment is zero, if n is negative, or if x or y is
// too short for the requested n and increment. It does nothing when n == 0.
//
// Float32 implementations are autogenerated and not directly tested.
func (Implementation) Scopy(n int, x []float32, incX int, y []float32, incY int) {
	switch {
	case incX == 0:
		panic(zeroIncX)
	case incY == 0:
		panic(zeroIncY)
	case n < 0:
		panic(nLT0)
	case n == 0:
		return
	case incX > 0 && len(x) <= (n-1)*incX, incX < 0 && len(x) <= (1-n)*incX:
		panic(shortX)
	case incY > 0 && len(y) <= (n-1)*incY, incY < 0 && len(y) <= (1-n)*incY:
		panic(shortY)
	}

	if incX == 1 && incY == 1 {
		// Both vectors are contiguous, so the built-in copy does the work.
		copy(y[:n], x[:n])
		return
	}

	// With a negative increment the walk starts at the far end of the slice
	// and moves towards index 0.
	ix, iy := 0, 0
	if incX < 0 {
		ix = (1 - n) * incX
	}
	if incY < 0 {
		iy = (1 - n) * incY
	}
	for left := n; left > 0; left-- {
		y[iy] = x[ix]
		ix += incX
		iy += incY
	}
}
// Saxpy adds alpha times x to y, over n elements visited with strides incX
// and incY.
//
//	y[i] += alpha * x[i] for all i
//
// Saxpy panics if either increment is zero, if n is negative, or if x or y is
// too short for the requested n and increment. After those checks it returns
// without touching y when n == 0 or alpha == 0.
//
// Float32 implementations are autogenerated and not directly tested.
func (Implementation) Saxpy(n int, alpha float32, x []float32, incX int, y []float32, incY int) {
	switch {
	case incX == 0:
		panic(zeroIncX)
	case incY == 0:
		panic(zeroIncY)
	case n < 0:
		panic(nLT0)
	case n == 0:
		return
	case incX > 0 && len(x) <= (n-1)*incX, incX < 0 && len(x) <= (1-n)*incX:
		panic(shortX)
	case incY > 0 && len(y) <= (n-1)*incY, incY < 0 && len(y) <= (1-n)*incY:
		panic(shortY)
	case alpha == 0:
		return
	}

	if incX == 1 && incY == 1 {
		f32.AxpyUnitary(alpha, x[:n], y[:n])
		return
	}

	// With a negative increment the walk starts at the far end of the slice.
	ix, iy := 0, 0
	if incX < 0 {
		ix = (1 - n) * incX
	}
	if incY < 0 {
		iy = (1 - n) * incY
	}
	f32.AxpyInc(alpha, x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy))
}
// Srotg computes a plane rotation
//
//	⎡  c s ⎤ ⎡ a ⎤ = ⎡ r ⎤
//	⎣ -s c ⎦ ⎣ b ⎦   ⎣ 0 ⎦
//
// satisfying c^2 + s^2 = 1.
//
// The computation uses the formulas
//
//	sigma = sgn(a)    if |a| >  |b|
//	      = sgn(b)    if |b| >= |a|
//	r = sigma*sqrt(a^2 + b^2)
//	c = 1; s = 0      if r = 0
//	c = a/r; s = b/r  if r != 0
//	c >= 0            if |a| > |b|
//
// The subroutine also computes
//
//	z = s    if |a| > |b|,
//	  = 1/c  if |b| >= |a| and c != 0
//	  = 1    if c = 0
//
// This allows c and s to be reconstructed from z as follows:
//
//	If z = 1, set c = 0, s = 1.
//	If |z| < 1, set c = sqrt(1 - z^2) and s = z.
//	If |z| > 1, set c = 1/z and s = sqrt(1 - c^2).
//
// NOTE: There is a discrepancy between the reference implementation and the
// BLAS technical manual regarding the sign for r when a or b are zero. Srotg
// agrees with the definition in the manual and other common BLAS
// implementations.
//
// Float32 implementations are autogenerated and not directly tested.
func (Implementation) Srotg(a, b float32) (c, s, r, z float32) {
// Implementation based on Supplemental Material to:
// Edward Anderson. 2017. Algorithm 978: Safe Scaling in the Level 1 BLAS.
// ACM Trans. Math. Softw. 44, 1, Article 12 (July 2017), 28 pages.
// DOI: https://doi.org/10.1145/3061665
const (
// safmin is 2^-126 and safmax is its reciprocal, 2^126; together they
// bound the scale factor scl computed below.
safmin = 0x1p-126
safmax = 1 / safmin
)
anorm := math.Abs(a)
bnorm := math.Abs(b)
switch {
case bnorm == 0:
// b is zero (this also covers a == b == 0): the rotation is the identity
// and r is a itself.
c = 1
s = 0
r = a
z = 0
case anorm == 0:
// Only a is zero: the rotation swaps the components and r is b.
c = 0
s = 1
r = b
z = 1
default:
// Divide a and b by scl, the larger magnitude clamped to [safmin, safmax],
// before squaring; the square root is multiplied by scl again afterwards.
maxab := math.Max(anorm, bnorm)
scl := math.Min(math.Max(safmin, maxab), safmax)
// sigma carries the sign of whichever input has the larger magnitude
// (b on ties).
var sigma float32
if anorm > bnorm {
sigma = math.Copysign(1, a)
} else {
sigma = math.Copysign(1, b)
}
ascl := a / scl
bscl := b / scl
r = sigma * (scl * math.Sqrt(ascl*ascl+bscl*bscl))
c = a / r
s = b / r
// z encodes the rotation in a single value, as described in the doc comment.
switch {
case anorm > bnorm:
z = s
case c != 0:
z = 1 / c
default:
z = 1
}
}
return c, s, r, z
}
// Srotmg computes the modified Givens rotation. See
// http://www.netlib.org/lapack/explore-html/df/deb/drotmg_8f.html
// for more details.
//
// It returns the rotation parameters p together with rd1, rd2 and rx1, the
// updated values of d1, d2 and x1. When d2 or y1 is zero the inputs are
// returned unchanged with p.Flag set to blas.Identity. In the branches marked
// "Error state" below, p.Flag is blas.Rescaling, p.H is left at its zero
// value and rd1, rd2 and rx1 are all zero.
//
// Float32 implementations are autogenerated and not directly tested.
func (Implementation) Srotmg(d1, d2, x1, y1 float32) (p blas.SrotmParams, rd1, rd2, rx1 float32) {
// The implementation of Srotmg used here is taken from Hopkins 1997
// Appendix A: https://doi.org/10.1145/289251.289253
// with the exception of the gam constants below.
const (
// gam is the rescaling factor; the loops below keep d1 and |d2| within
// (rgamsq, gamsq] unless they are zero.
gam = 4096.0
gamsq = gam * gam
rgamsq = 1.0 / gamsq
)
if d1 < 0 {
p.Flag = blas.Rescaling // Error state.
return p, 0, 0, 0
}
if d2 == 0 || y1 == 0 {
p.Flag = blas.Identity
return p, d1, d2, x1
}
var h11, h12, h21, h22 float32
if (d1 == 0 || x1 == 0) && d2 > 0 {
// The first component is zero: swap the roles of the two components.
p.Flag = blas.Diagonal
h12 = 1
h21 = -1
x1 = y1
d1, d2 = d2, d1
} else {
p2 := d2 * y1
p1 := d1 * x1
q2 := p2 * y1
q1 := p1 * x1
if math.Abs(q1) > math.Abs(q2) {
// |q1| dominates: H has unit diagonal (h11 = h22 = 1).
p.Flag = blas.OffDiagonal
h11 = 1
h22 = 1
h21 = -y1 / x1
h12 = p2 / p1
u := 1 - float32(h12*h21)
if u <= 0 {
p.Flag = blas.Rescaling // Error state.
return p, 0, 0, 0
}
d1 /= u
d2 /= u
x1 *= u
} else {
if q2 < 0 {
p.Flag = blas.Rescaling // Error state.
return p, 0, 0, 0
}
// |q2| dominates: H has fixed off-diagonal entries (h12 = 1, h21 = -1).
p.Flag = blas.Diagonal
h21 = -1
h12 = 1
h11 = p1 / p2
h22 = x1 / y1
u := 1 + float32(h11*h22)
d1, d2 = d2/u, d1/u
x1 = y1 * u
}
}
// Rescale d1 by gam^2 until it lies in (rgamsq, gamsq], compensating in x1
// and in h11 and h12. Any rescaling switches the flag to Rescaling so that
// all four entries of H are stored.
for d1 <= rgamsq && d1 != 0 {
p.Flag = blas.Rescaling
d1 = (d1 * gam) * gam
x1 /= gam
h11 /= gam
h12 /= gam
}
for d1 > gamsq {
p.Flag = blas.Rescaling
d1 = (d1 / gam) / gam
x1 *= gam
h11 *= gam
h12 *= gam
}
// Same for |d2|, compensating in h21 and h22.
for math.Abs(d2) <= rgamsq && d2 != 0 {
p.Flag = blas.Rescaling
d2 = (d2 * gam) * gam
h21 /= gam
h22 /= gam
}
for math.Abs(d2) > gamsq {
p.Flag = blas.Rescaling
d2 = (d2 / gam) / gam
h21 *= gam
h22 *= gam
}
// p.H is laid out as {h11, h21, h12, h22}. Only the entries that the flag
// does not fix are stored; the rest stay zero.
switch p.Flag {
case blas.Diagonal:
p.H = [4]float32{0: h11, 3: h22}
case blas.OffDiagonal:
p.H = [4]float32{1: h21, 2: h12}
case blas.Rescaling:
p.H = [4]float32{h11, h21, h12, h22}
default:
panic(badFlag)
}
return p, d1, d2, x1
}
// Srot applies a plane transformation to n pairs of elements taken from x and
// y with strides incX and incY.
//
//	x[i] = c * x[i] + s * y[i]
//	y[i] = c * y[i] - s * x[i]
//
// Both right-hand sides use the values of x[i] and y[i] from before the
// update. Srot panics if either increment is zero, if n is negative, or if x
// or y is too short for the requested n and increment. It does nothing when
// n == 0.
//
// Float32 implementations are autogenerated and not directly tested.
func (Implementation) Srot(n int, x []float32, incX int, y []float32, incY int, c float32, s float32) {
	switch {
	case incX == 0:
		panic(zeroIncX)
	case incY == 0:
		panic(zeroIncY)
	case n < 0:
		panic(nLT0)
	case n == 0:
		return
	case incX > 0 && len(x) <= (n-1)*incX, incX < 0 && len(x) <= (1-n)*incX:
		panic(shortX)
	case incY > 0 && len(y) <= (n-1)*incY, incY < 0 && len(y) <= (1-n)*incY:
		panic(shortY)
	}

	if incX == 1 && incY == 1 {
		for i := 0; i < n; i++ {
			vx, vy := x[i], y[i]
			x[i], y[i] = c*vx+s*vy, c*vy-s*vx
		}
		return
	}

	// With a negative increment the walk starts at the far end of the slice
	// and moves towards index 0.
	ix, iy := 0, 0
	if incX < 0 {
		ix = (1 - n) * incX
	}
	if incY < 0 {
		iy = (1 - n) * incY
	}
	for left := n; left > 0; left-- {
		vx, vy := x[ix], y[iy]
		x[ix], y[iy] = c*vx+s*vy, c*vy-s*vx
		ix += incX
		iy += incY
	}
}
// Srotm applies the modified Givens rotation to the 2×n matrix.
//
// The two rows of the matrix are the n elements of x and y visited with
// strides incX and incY, and are updated in place. The rotation is described
// by p: p.H is read as {h11, h21, h12, h22}, and p.Flag selects which of
// those entries are used and which are implied:
//
//	blas.Identity:    x and y are left unchanged
//	blas.Rescaling:   all four entries of p.H are used
//	blas.OffDiagonal: h11 = h22 = 1 are implied; h21 and h12 come from p.H
//	blas.Diagonal:    h12 = 1, h21 = -1 are implied; h11 and h22 come from p.H
//
// Any other flag value leaves x and y unchanged. Srotm panics if either
// increment is zero, if n is negative, or if x or y is too short for the
// requested n and increment.
//
// Float32 implementations are autogenerated and not directly tested.
func (Implementation) Srotm(n int, x []float32, incX int, y []float32, incY int, p blas.SrotmParams) {
if incX == 0 {
panic(zeroIncX)
}
if incY == 0 {
panic(zeroIncY)
}
if n <= 0 {
if n == 0 {
return
}
panic(nLT0)
}
if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
panic(shortX)
}
if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
panic(shortY)
}
if p.Flag == blas.Identity {
return
}
// NOTE(review): the explicit float32(...) conversions around each product
// round it to float32 before the addition; presumably this is there to stop
// the compiler fusing the multiply and add. Keep them when editing.
switch p.Flag {
case blas.Rescaling:
// General 2×2 matrix: every entry of H is taken from p.H.
h11 := p.H[0]
h12 := p.H[2]
h21 := p.H[1]
h22 := p.H[3]
if incX == 1 && incY == 1 {
x = x[:n]
for i, vx := range x {
vy := y[i]
x[i], y[i] = float32(vx*h11)+float32(vy*h12), float32(vx*h21)+float32(vy*h22)
}
return
}
// With a negative increment the walk starts at the far end of the slice.
var ix, iy int
if incX < 0 {
ix = (-n + 1) * incX
}
if incY < 0 {
iy = (-n + 1) * incY
}
for i := 0; i < n; i++ {
vx := x[ix]
vy := y[iy]
x[ix], y[iy] = float32(vx*h11)+float32(vy*h12), float32(vx*h21)+float32(vy*h22)
ix += incX
iy += incY
}
case blas.OffDiagonal:
// Unit diagonal: only the off-diagonal entries are read from p.H.
h12 := p.H[2]
h21 := p.H[1]
if incX == 1 && incY == 1 {
x = x[:n]
for i, vx := range x {
vy := y[i]
x[i], y[i] = vx+float32(vy*h12), float32(vx*h21)+vy
}
return
}
var ix, iy int
if incX < 0 {
ix = (-n + 1) * incX
}
if incY < 0 {
iy = (-n + 1) * incY
}
for i := 0; i < n; i++ {
vx := x[ix]
vy := y[iy]
x[ix], y[iy] = vx+float32(vy*h12), float32(vx*h21)+vy
ix += incX
iy += incY
}
case blas.Diagonal:
// Off-diagonal entries fixed at +1 and -1: only the diagonal is read from p.H.
h11 := p.H[0]
h22 := p.H[3]
if incX == 1 && incY == 1 {
x = x[:n]
for i, vx := range x {
vy := y[i]
x[i], y[i] = float32(vx*h11)+vy, -vx+float32(vy*h22)
}
return
}
var ix, iy int
if incX < 0 {
ix = (-n + 1) * incX
}
if incY < 0 {
iy = (-n + 1) * incY
}
for i := 0; i < n; i++ {
vx := x[ix]
vy := y[iy]
x[ix], y[iy] = float32(vx*h11)+vy, -vx+float32(vy*h22)
ix += incX
iy += incY
}
}
}
// Sscal multiplies the n elements of x visited with stride incX by alpha.
//
//	x[i] *= alpha
//
// Sscal has no effect if incX < 0 or n == 0. It panics if incX is zero, if n
// is negative, or if x is too short for the requested n and incX. When alpha
// is zero the elements are overwritten with 0 instead of being multiplied.
//
// Float32 implementations are autogenerated and not directly tested.
func (Implementation) Sscal(n int, alpha float32, x []float32, incX int) {
	switch {
	case incX == 0:
		panic(zeroIncX)
	case incX < 0:
		return
	case n < 0:
		panic(nLT0)
	case n == 0:
		return
	case (n-1)*incX >= len(x):
		panic(shortX)
	}

	contiguous := incX == 1
	if alpha != 0 {
		if contiguous {
			f32.ScalUnitary(alpha, x[:n])
			return
		}
		f32.ScalInc(alpha, x, uintptr(n), uintptr(incX))
		return
	}

	// alpha == 0: store zeros directly.
	if contiguous {
		head := x[:n]
		for i := 0; i < len(head); i++ {
			head[i] = 0
		}
		return
	}
	for ix, end := 0, n*incX; ix < end; ix += incX {
		x[ix] = 0
	}
}

View File

@@ -0,0 +1,54 @@
// Code generated by "go generate gonum.org/v1/gonum/blas/gonum"; DO NOT EDIT.
// Copyright ©2015 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package gonum
import (
"gonum.org/v1/gonum/internal/asm/f32"
)
// Dsdot returns, as a float64, the dot product of n elements of the float32
// vectors x and y visited with strides incX and incY,
//
//	\sum_i x[i]*y[i]
//
// Dsdot returns 0 when n == 0. It panics if either increment is zero, if n is
// negative, or if x or y is too short for the requested n and increment.
//
// Float32 implementations are autogenerated and not directly tested.
func (Implementation) Dsdot(n int, x []float32, incX int, y []float32, incY int) float64 {
	switch {
	case incX == 0:
		panic(zeroIncX)
	case incY == 0:
		panic(zeroIncY)
	case n < 0:
		panic(nLT0)
	case n == 0:
		return 0
	}

	if incX == 1 && incY == 1 {
		switch {
		case len(x) < n:
			panic(shortX)
		case len(y) < n:
			panic(shortY)
		}
		return f32.DdotUnitary(x[:n], y[:n])
	}

	// With a negative increment the walk starts at the far end of the slice.
	ix, iy := 0, 0
	if incX < 0 {
		ix = (1 - n) * incX
	}
	if incY < 0 {
		iy = (1 - n) * incY
	}
	// Both the first and the last index visited must lie inside the slice.
	switch {
	case ix >= len(x), ix+(n-1)*incX >= len(x):
		panic(shortX)
	case iy >= len(y), iy+(n-1)*incY >= len(y):
		panic(shortY)
	}
	return f32.DdotInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy))
}

View File

@@ -0,0 +1,54 @@
// Code generated by "go generate gonum.org/v1/gonum/blas/gonum"; DO NOT EDIT.
// Copyright ©2015 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package gonum
import (
"gonum.org/v1/gonum/internal/asm/f32"
)
// Sdot returns the dot product of n elements of x and y visited with strides
// incX and incY,
//
//	\sum_i x[i]*y[i]
//
// Sdot returns 0 when n == 0. It panics if either increment is zero, if n is
// negative, or if x or y is too short for the requested n and increment.
//
// Float32 implementations are autogenerated and not directly tested.
func (Implementation) Sdot(n int, x []float32, incX int, y []float32, incY int) float32 {
	switch {
	case incX == 0:
		panic(zeroIncX)
	case incY == 0:
		panic(zeroIncY)
	case n < 0:
		panic(nLT0)
	case n == 0:
		return 0
	}

	if incX == 1 && incY == 1 {
		switch {
		case len(x) < n:
			panic(shortX)
		case len(y) < n:
			panic(shortY)
		}
		return f32.DotUnitary(x[:n], y[:n])
	}

	// With a negative increment the walk starts at the far end of the slice.
	ix, iy := 0, 0
	if incX < 0 {
		ix = (1 - n) * incX
	}
	if incY < 0 {
		iy = (1 - n) * incY
	}
	// Both the first and the last index visited must lie inside the slice.
	switch {
	case ix >= len(x), ix+(n-1)*incX >= len(x):
		panic(shortX)
	case iy >= len(y), iy+(n-1)*incY >= len(y):
		panic(shortY)
	}
	return f32.DotInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy))
}

View File

@@ -0,0 +1,54 @@
// Code generated by "go generate gonum.org/v1/gonum/blas/gonum"; DO NOT EDIT.
// Copyright ©2015 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package gonum
import (
"gonum.org/v1/gonum/internal/asm/f32"
)
// Sdsdot computes the dot product of the two vectors plus a constant
//
//	alpha + \sum_i x[i]*y[i]
//
// over n elements of x and y visited with strides incX and incY. The dot
// product is obtained from the f32.Ddot* kernels as a float64, converted to
// float32 and then added to alpha.
//
// When n == 0 the sum is empty and Sdsdot returns alpha, which is also what
// the reference BLAS SDSDOT returns. Sdsdot panics if either increment is
// zero, if n is negative, or if x or y is too short for the requested n and
// increment.
//
// Float32 implementations are autogenerated and not directly tested.
func (Implementation) Sdsdot(n int, alpha float32, x []float32, incX int, y []float32, incY int) float32 {
	if incX == 0 {
		panic(zeroIncX)
	}
	if incY == 0 {
		panic(zeroIncY)
	}
	if n <= 0 {
		if n == 0 {
			// Empty sum: only the constant term remains.
			return alpha
		}
		panic(nLT0)
	}
	if incX == 1 && incY == 1 {
		if len(x) < n {
			panic(shortX)
		}
		if len(y) < n {
			panic(shortY)
		}
		return alpha + float32(f32.DdotUnitary(x[:n], y[:n]))
	}
	// With a negative increment the walk starts at the far end of the slice.
	var ix, iy int
	if incX < 0 {
		ix = (-n + 1) * incX
	}
	if incY < 0 {
		iy = (-n + 1) * incY
	}
	// Both the first and the last index visited must lie inside the slice.
	if ix >= len(x) || ix+(n-1)*incX >= len(x) {
		panic(shortX)
	}
	if iy >= len(y) || iy+(n-1)*incY >= len(y) {
		panic(shortY)
	}
	return alpha + float32(f32.DdotInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy)))
}

629
vendor/gonum.org/v1/gonum/blas/gonum/level1float64.go generated vendored Normal file
View File

@@ -0,0 +1,629 @@
// Copyright ©2015 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package gonum
import (
"math"
"gonum.org/v1/gonum/blas"
"gonum.org/v1/gonum/internal/asm/f64"
)
// Compile-time assertion that Implementation is assignable to blas.Float64Level1.
var _ blas.Float64Level1 = Implementation{}
// Dnrm2 returns the Euclidean norm of the n elements of x visited with
// stride incX,
//
//	sqrt(\sum_i x[i] * x[i]).
//
// A negative incX yields 0. Dnrm2 panics if incX is zero, if n is negative,
// or if x is too short for the requested n and incX.
func (Implementation) Dnrm2(n int, x []float64, incX int) float64 {
	switch {
	case incX == 0:
		panic(zeroIncX)
	case incX < 0:
		return 0
	case len(x) <= (n-1)*incX:
		panic(shortX)
	case n < 0:
		panic(nLT0)
	case n == 0:
		return 0
	case n == 1:
		// The norm of a single element is its magnitude.
		return math.Abs(x[0])
	case incX == 1:
		return f64.L2NormUnitary(x[:n])
	}
	return f64.L2NormInc(x, uintptr(n), uintptr(incX))
}
// Dasum returns the sum of the absolute values of the n elements of x
// visited with stride incX,
//
//	\sum_i |x[i]|
//
// A negative incX yields 0. Dasum panics if n is negative, if incX is zero,
// or if x is too short for the requested n and incX.
func (Implementation) Dasum(n int, x []float64, incX int) float64 {
	switch {
	case n < 0:
		panic(nLT0)
	case incX == 0:
		panic(zeroIncX)
	case incX < 0:
		return 0
	case len(x) <= (n-1)*incX:
		panic(shortX)
	}

	var total float64
	if incX == 1 {
		// Contiguous case: walk the first n elements directly.
		for _, v := range x[:n] {
			total += math.Abs(v)
		}
		return total
	}
	for i, ix := 0, 0; i < n; i, ix = i+1, ix+incX {
		total += math.Abs(x[ix])
	}
	return total
}
// Idamax returns the logical index (0 ≤ index < n) of the element with the
// largest absolute value among the n elements of x visited with stride incX.
// Ties are resolved in favour of the earliest index.
// Idamax returns -1 if n == 0 or if incX is negative, and panics if incX is
// zero, if n is negative, or if x is too short for the requested n and incX.
func (Implementation) Idamax(n int, x []float64, incX int) int {
	switch {
	case incX == 0:
		panic(zeroIncX)
	case incX < 0:
		return -1
	case len(x) <= (n-1)*incX:
		panic(shortX)
	case n < 0:
		panic(nLT0)
	case n == 0:
		// Netlib returns invalid index when n == 0.
		return -1
	case n == 1:
		return 0
	}

	best, largest := 0, math.Abs(x[0])
	if incX == 1 {
		for i, v := range x[:n] {
			// Strict comparison keeps the earliest index on ties.
			if a := math.Abs(v); a > largest {
				best, largest = i, a
			}
		}
		return best
	}
	for i := 1; i < n; i++ {
		if a := math.Abs(x[i*incX]); a > largest {
			best, largest = i, a
		}
	}
	return best
}
// Dswap exchanges the n elements of x and y visited with strides incX and
// incY.
//
//	x[i], y[i] = y[i], x[i] for all i
//
// Dswap panics if either increment is zero, if n is negative, or if x or y is
// too short for the requested n and increment. It does nothing when n == 0.
func (Implementation) Dswap(n int, x []float64, incX int, y []float64, incY int) {
	switch {
	case incX == 0:
		panic(zeroIncX)
	case incY == 0:
		panic(zeroIncY)
	case n < 0:
		panic(nLT0)
	case n == 0:
		return
	case incX > 0 && len(x) <= (n-1)*incX, incX < 0 && len(x) <= (1-n)*incX:
		panic(shortX)
	case incY > 0 && len(y) <= (n-1)*incY, incY < 0 && len(y) <= (1-n)*incY:
		panic(shortY)
	}

	if incX == 1 && incY == 1 {
		for i := 0; i < n; i++ {
			x[i], y[i] = y[i], x[i]
		}
		return
	}

	// With a negative increment the walk starts at the far end of the slice
	// and moves towards index 0.
	ix, iy := 0, 0
	if incX < 0 {
		ix = (1 - n) * incX
	}
	if incY < 0 {
		iy = (1 - n) * incY
	}
	for left := n; left > 0; left-- {
		x[ix], y[iy] = y[iy], x[ix]
		ix += incX
		iy += incY
	}
}
// Dcopy copies n elements of x, visited with stride incX, into the elements
// of y visited with stride incY.
//
//	y[i] = x[i] for all i
//
// Dcopy panics if either increment is zero, if n is negative, or if x or y is
// too short for the requested n and increment. It does nothing when n == 0.
func (Implementation) Dcopy(n int, x []float64, incX int, y []float64, incY int) {
	switch {
	case incX == 0:
		panic(zeroIncX)
	case incY == 0:
		panic(zeroIncY)
	case n < 0:
		panic(nLT0)
	case n == 0:
		return
	case incX > 0 && len(x) <= (n-1)*incX, incX < 0 && len(x) <= (1-n)*incX:
		panic(shortX)
	case incY > 0 && len(y) <= (n-1)*incY, incY < 0 && len(y) <= (1-n)*incY:
		panic(shortY)
	}

	if incX == 1 && incY == 1 {
		// Both vectors are contiguous, so the built-in copy does the work.
		copy(y[:n], x[:n])
		return
	}

	// With a negative increment the walk starts at the far end of the slice
	// and moves towards index 0.
	ix, iy := 0, 0
	if incX < 0 {
		ix = (1 - n) * incX
	}
	if incY < 0 {
		iy = (1 - n) * incY
	}
	for left := n; left > 0; left-- {
		y[iy] = x[ix]
		ix += incX
		iy += incY
	}
}
// Daxpy adds alpha times x to y, over n elements visited with strides incX
// and incY.
//
//	y[i] += alpha * x[i] for all i
//
// Daxpy panics if either increment is zero, if n is negative, or if x or y is
// too short for the requested n and increment. After those checks it returns
// without touching y when n == 0 or alpha == 0.
func (Implementation) Daxpy(n int, alpha float64, x []float64, incX int, y []float64, incY int) {
	switch {
	case incX == 0:
		panic(zeroIncX)
	case incY == 0:
		panic(zeroIncY)
	case n < 0:
		panic(nLT0)
	case n == 0:
		return
	case incX > 0 && len(x) <= (n-1)*incX, incX < 0 && len(x) <= (1-n)*incX:
		panic(shortX)
	case incY > 0 && len(y) <= (n-1)*incY, incY < 0 && len(y) <= (1-n)*incY:
		panic(shortY)
	case alpha == 0:
		return
	}

	if incX == 1 && incY == 1 {
		f64.AxpyUnitary(alpha, x[:n], y[:n])
		return
	}

	// With a negative increment the walk starts at the far end of the slice.
	ix, iy := 0, 0
	if incX < 0 {
		ix = (1 - n) * incX
	}
	if incY < 0 {
		iy = (1 - n) * incY
	}
	f64.AxpyInc(alpha, x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy))
}
// Drotg computes a plane rotation
//
//	⎡  c s ⎤ ⎡ a ⎤ = ⎡ r ⎤
//	⎣ -s c ⎦ ⎣ b ⎦   ⎣ 0 ⎦
//
// satisfying c^2 + s^2 = 1.
//
// The computation uses the formulas
//
//	sigma = sgn(a)    if |a| >  |b|
//	      = sgn(b)    if |b| >= |a|
//	r = sigma*sqrt(a^2 + b^2)
//	c = 1; s = 0      if r = 0
//	c = a/r; s = b/r  if r != 0
//	c >= 0            if |a| > |b|
//
// The subroutine also computes
//
//	z = s    if |a| > |b|,
//	  = 1/c  if |b| >= |a| and c != 0
//	  = 1    if c = 0
//
// This allows c and s to be reconstructed from z as follows:
//
//	If z = 1, set c = 0, s = 1.
//	If |z| < 1, set c = sqrt(1 - z^2) and s = z.
//	If |z| > 1, set c = 1/z and s = sqrt(1 - c^2).
//
// NOTE: There is a discrepancy between the reference implementation and the
// BLAS technical manual regarding the sign for r when a or b are zero. Drotg
// agrees with the definition in the manual and other common BLAS
// implementations.
func (Implementation) Drotg(a, b float64) (c, s, r, z float64) {
// Implementation based on Supplemental Material to:
// Edward Anderson. 2017. Algorithm 978: Safe Scaling in the Level 1 BLAS.
// ACM Trans. Math. Softw. 44, 1, Article 12 (July 2017), 28 pages.
// DOI: https://doi.org/10.1145/3061665
const (
// safmin is 2^-1022 and safmax is its reciprocal, 2^1022; together they
// bound the scale factor scl computed below.
safmin = 0x1p-1022
safmax = 1 / safmin
)
anorm := math.Abs(a)
bnorm := math.Abs(b)
switch {
case bnorm == 0:
// b is zero (this also covers a == b == 0): the rotation is the identity
// and r is a itself.
c = 1
s = 0
r = a
z = 0
case anorm == 0:
// Only a is zero: the rotation swaps the components and r is b.
c = 0
s = 1
r = b
z = 1
default:
// Divide a and b by scl, the larger magnitude clamped to [safmin, safmax],
// before squaring; the square root is multiplied by scl again afterwards.
maxab := math.Max(anorm, bnorm)
scl := math.Min(math.Max(safmin, maxab), safmax)
// sigma carries the sign of whichever input has the larger magnitude
// (b on ties).
var sigma float64
if anorm > bnorm {
sigma = math.Copysign(1, a)
} else {
sigma = math.Copysign(1, b)
}
ascl := a / scl
bscl := b / scl
r = sigma * (scl * math.Sqrt(ascl*ascl+bscl*bscl))
c = a / r
s = b / r
// z encodes the rotation in a single value, as described in the doc comment.
switch {
case anorm > bnorm:
z = s
case c != 0:
z = 1 / c
default:
z = 1
}
}
return c, s, r, z
}
// Drotmg computes the modified Givens rotation. See
// http://www.netlib.org/lapack/explore-html/df/deb/drotmg_8f.html
// for more details.
//
// It returns the rotation parameters p together with rd1, rd2 and rx1, the
// updated values of d1, d2 and x1. When d2 or y1 is zero the inputs are
// returned unchanged with p.Flag set to blas.Identity. In the branches marked
// "Error state" below, p.Flag is blas.Rescaling, p.H is left at its zero
// value and rd1, rd2 and rx1 are all zero.
func (Implementation) Drotmg(d1, d2, x1, y1 float64) (p blas.DrotmParams, rd1, rd2, rx1 float64) {
// The implementation of Drotmg used here is taken from Hopkins 1997
// Appendix A: https://doi.org/10.1145/289251.289253
// with the exception of the gam constants below.
const (
// gam is the rescaling factor; the loops below keep d1 and |d2| within
// (rgamsq, gamsq] unless they are zero.
gam = 4096.0
gamsq = gam * gam
rgamsq = 1.0 / gamsq
)
if d1 < 0 {
p.Flag = blas.Rescaling // Error state.
return p, 0, 0, 0
}
if d2 == 0 || y1 == 0 {
p.Flag = blas.Identity
return p, d1, d2, x1
}
var h11, h12, h21, h22 float64
if (d1 == 0 || x1 == 0) && d2 > 0 {
// The first component is zero: swap the roles of the two components.
p.Flag = blas.Diagonal
h12 = 1
h21 = -1
x1 = y1
d1, d2 = d2, d1
} else {
p2 := d2 * y1
p1 := d1 * x1
q2 := p2 * y1
q1 := p1 * x1
if math.Abs(q1) > math.Abs(q2) {
// |q1| dominates: H has unit diagonal (h11 = h22 = 1).
p.Flag = blas.OffDiagonal
h11 = 1
h22 = 1
h21 = -y1 / x1
h12 = p2 / p1
u := 1 - float64(h12*h21)
if u <= 0 {
p.Flag = blas.Rescaling // Error state.
return p, 0, 0, 0
}
d1 /= u
d2 /= u
x1 *= u
} else {
if q2 < 0 {
p.Flag = blas.Rescaling // Error state.
return p, 0, 0, 0
}
// |q2| dominates: H has fixed off-diagonal entries (h12 = 1, h21 = -1).
p.Flag = blas.Diagonal
h21 = -1
h12 = 1
h11 = p1 / p2
h22 = x1 / y1
u := 1 + float64(h11*h22)
d1, d2 = d2/u, d1/u
x1 = y1 * u
}
}
// Rescale d1 by gam^2 until it lies in (rgamsq, gamsq], compensating in x1
// and in h11 and h12. Any rescaling switches the flag to Rescaling so that
// all four entries of H are stored.
for d1 <= rgamsq && d1 != 0 {
p.Flag = blas.Rescaling
d1 = (d1 * gam) * gam
x1 /= gam
h11 /= gam
h12 /= gam
}
for d1 > gamsq {
p.Flag = blas.Rescaling
d1 = (d1 / gam) / gam
x1 *= gam
h11 *= gam
h12 *= gam
}
// Same for |d2|, compensating in h21 and h22.
for math.Abs(d2) <= rgamsq && d2 != 0 {
p.Flag = blas.Rescaling
d2 = (d2 * gam) * gam
h21 /= gam
h22 /= gam
}
for math.Abs(d2) > gamsq {
p.Flag = blas.Rescaling
d2 = (d2 / gam) / gam
h21 *= gam
h22 *= gam
}
// p.H is laid out as {h11, h21, h12, h22}. Only the entries that the flag
// does not fix are stored; the rest stay zero.
switch p.Flag {
case blas.Diagonal:
p.H = [4]float64{0: h11, 3: h22}
case blas.OffDiagonal:
p.H = [4]float64{1: h21, 2: h12}
case blas.Rescaling:
p.H = [4]float64{h11, h21, h12, h22}
default:
panic(badFlag)
}
return p, d1, d2, x1
}
// Drot applies a plane transformation to n pairs of elements taken from x and
// y with strides incX and incY.
//
//	x[i] = c * x[i] + s * y[i]
//	y[i] = c * y[i] - s * x[i]
//
// Both right-hand sides use the values of x[i] and y[i] from before the
// update. Drot panics if either increment is zero, if n is negative, or if x
// or y is too short for the requested n and increment. It does nothing when
// n == 0.
func (Implementation) Drot(n int, x []float64, incX int, y []float64, incY int, c float64, s float64) {
	switch {
	case incX == 0:
		panic(zeroIncX)
	case incY == 0:
		panic(zeroIncY)
	case n < 0:
		panic(nLT0)
	case n == 0:
		return
	case incX > 0 && len(x) <= (n-1)*incX, incX < 0 && len(x) <= (1-n)*incX:
		panic(shortX)
	case incY > 0 && len(y) <= (n-1)*incY, incY < 0 && len(y) <= (1-n)*incY:
		panic(shortY)
	}

	if incX == 1 && incY == 1 {
		for i := 0; i < n; i++ {
			vx, vy := x[i], y[i]
			x[i], y[i] = c*vx+s*vy, c*vy-s*vx
		}
		return
	}

	// With a negative increment the walk starts at the far end of the slice
	// and moves towards index 0.
	ix, iy := 0, 0
	if incX < 0 {
		ix = (1 - n) * incX
	}
	if incY < 0 {
		iy = (1 - n) * incY
	}
	for left := n; left > 0; left-- {
		vx, vy := x[ix], y[iy]
		x[ix], y[iy] = c*vx+s*vy, c*vy-s*vx
		ix += incX
		iy += incY
	}
}
// Drotm applies the modified Givens rotation to the 2×n matrix.
//
// The two rows of the matrix are the n elements of x and y visited with
// strides incX and incY, and are updated in place. The rotation is described
// by p: p.H is read as {h11, h21, h12, h22}, and p.Flag selects which of
// those entries are used and which are implied:
//
//	blas.Identity:    x and y are left unchanged
//	blas.Rescaling:   all four entries of p.H are used
//	blas.OffDiagonal: h11 = h22 = 1 are implied; h21 and h12 come from p.H
//	blas.Diagonal:    h12 = 1, h21 = -1 are implied; h11 and h22 come from p.H
//
// Any other flag value leaves x and y unchanged. Drotm panics if either
// increment is zero, if n is negative, or if x or y is too short for the
// requested n and increment.
func (Implementation) Drotm(n int, x []float64, incX int, y []float64, incY int, p blas.DrotmParams) {
if incX == 0 {
panic(zeroIncX)
}
if incY == 0 {
panic(zeroIncY)
}
if n <= 0 {
if n == 0 {
return
}
panic(nLT0)
}
if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
panic(shortX)
}
if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
panic(shortY)
}
if p.Flag == blas.Identity {
return
}
// NOTE(review): the explicit float64(...) conversions around each product
// round it to float64 before the addition; presumably this is there to stop
// the compiler fusing the multiply and add. Keep them when editing.
switch p.Flag {
case blas.Rescaling:
// General 2×2 matrix: every entry of H is taken from p.H.
h11 := p.H[0]
h12 := p.H[2]
h21 := p.H[1]
h22 := p.H[3]
if incX == 1 && incY == 1 {
x = x[:n]
for i, vx := range x {
vy := y[i]
x[i], y[i] = float64(vx*h11)+float64(vy*h12), float64(vx*h21)+float64(vy*h22)
}
return
}
// With a negative increment the walk starts at the far end of the slice.
var ix, iy int
if incX < 0 {
ix = (-n + 1) * incX
}
if incY < 0 {
iy = (-n + 1) * incY
}
for i := 0; i < n; i++ {
vx := x[ix]
vy := y[iy]
x[ix], y[iy] = float64(vx*h11)+float64(vy*h12), float64(vx*h21)+float64(vy*h22)
ix += incX
iy += incY
}
case blas.OffDiagonal:
// Unit diagonal: only the off-diagonal entries are read from p.H.
h12 := p.H[2]
h21 := p.H[1]
if incX == 1 && incY == 1 {
x = x[:n]
for i, vx := range x {
vy := y[i]
x[i], y[i] = vx+float64(vy*h12), float64(vx*h21)+vy
}
return
}
var ix, iy int
if incX < 0 {
ix = (-n + 1) * incX
}
if incY < 0 {
iy = (-n + 1) * incY
}
for i := 0; i < n; i++ {
vx := x[ix]
vy := y[iy]
x[ix], y[iy] = vx+float64(vy*h12), float64(vx*h21)+vy
ix += incX
iy += incY
}
case blas.Diagonal:
// Off-diagonal entries fixed at +1 and -1: only the diagonal is read from p.H.
h11 := p.H[0]
h22 := p.H[3]
if incX == 1 && incY == 1 {
x = x[:n]
for i, vx := range x {
vy := y[i]
x[i], y[i] = float64(vx*h11)+vy, -vx+float64(vy*h22)
}
return
}
var ix, iy int
if incX < 0 {
ix = (-n + 1) * incX
}
if incY < 0 {
iy = (-n + 1) * incY
}
for i := 0; i < n; i++ {
vx := x[ix]
vy := y[iy]
x[ix], y[iy] = float64(vx*h11)+vy, -vx+float64(vy*h22)
ix += incX
iy += incY
}
}
}
// Dscal multiplies the n elements of x visited with stride incX by alpha.
//
//	x[i] *= alpha
//
// Dscal has no effect if incX < 0 or n == 0. It panics if incX is zero, if n
// is negative, or if x is too short for the requested n and incX. When alpha
// is zero the elements are overwritten with 0 instead of being multiplied.
func (Implementation) Dscal(n int, alpha float64, x []float64, incX int) {
	switch {
	case incX == 0:
		panic(zeroIncX)
	case incX < 0:
		return
	case n < 0:
		panic(nLT0)
	case n == 0:
		return
	case (n-1)*incX >= len(x):
		panic(shortX)
	}

	contiguous := incX == 1
	if alpha != 0 {
		if contiguous {
			f64.ScalUnitary(alpha, x[:n])
			return
		}
		f64.ScalInc(alpha, x, uintptr(n), uintptr(incX))
		return
	}

	// alpha == 0: store zeros directly.
	if contiguous {
		head := x[:n]
		for i := 0; i < len(head); i++ {
			head[i] = 0
		}
		return
	}
	for ix, end := 0, n*incX; ix < end; ix += incX {
		x[ix] = 0
	}
}

View File

@@ -0,0 +1,50 @@
// Copyright ©2015 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package gonum
import (
"gonum.org/v1/gonum/internal/asm/f64"
)
// Ddot returns the dot product of n elements of x and y visited with strides
// incX and incY,
//
//	\sum_i x[i]*y[i]
//
// Ddot returns 0 when n == 0. It panics if either increment is zero, if n is
// negative, or if x or y is too short for the requested n and increment.
func (Implementation) Ddot(n int, x []float64, incX int, y []float64, incY int) float64 {
	switch {
	case incX == 0:
		panic(zeroIncX)
	case incY == 0:
		panic(zeroIncY)
	case n < 0:
		panic(nLT0)
	case n == 0:
		return 0
	}

	if incX == 1 && incY == 1 {
		switch {
		case len(x) < n:
			panic(shortX)
		case len(y) < n:
			panic(shortY)
		}
		return f64.DotUnitary(x[:n], y[:n])
	}

	// With a negative increment the walk starts at the far end of the slice.
	ix, iy := 0, 0
	if incX < 0 {
		ix = (1 - n) * incX
	}
	if incY < 0 {
		iy = (1 - n) * incY
	}
	// Both the first and the last index visited must lie inside the slice.
	switch {
	case ix >= len(x), ix+(n-1)*incX >= len(x):
		panic(shortX)
	case iy >= len(y), iy+(n-1)*incY >= len(y):
		panic(shortY)
	}
	return f64.DotInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy))
}

2940
vendor/gonum.org/v1/gonum/blas/gonum/level2cmplx128.go generated vendored Normal file

File diff suppressed because it is too large Load Diff

2976
vendor/gonum.org/v1/gonum/blas/gonum/level2cmplx64.go generated vendored Normal file

File diff suppressed because it is too large Load Diff

2400
vendor/gonum.org/v1/gonum/blas/gonum/level2float32.go generated vendored Normal file

File diff suppressed because it is too large Load Diff

2366
vendor/gonum.org/v1/gonum/blas/gonum/level2float64.go generated vendored Normal file

File diff suppressed because it is too large Load Diff

1751
vendor/gonum.org/v1/gonum/blas/gonum/level3cmplx128.go generated vendored Normal file

File diff suppressed because it is too large Load Diff

1771
vendor/gonum.org/v1/gonum/blas/gonum/level3cmplx64.go generated vendored Normal file

File diff suppressed because it is too large Load Diff

925
vendor/gonum.org/v1/gonum/blas/gonum/level3float32.go generated vendored Normal file
View File

@@ -0,0 +1,925 @@
// Code generated by "go generate gonum.org/v1/gonum/blas/gonum"; DO NOT EDIT.
// Copyright ©2014 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package gonum
import (
"gonum.org/v1/gonum/blas"
"gonum.org/v1/gonum/internal/asm/f32"
)
// Compile-time assertion that Implementation is assignable to blas.Float32Level3.
var _ blas.Float32Level3 = Implementation{}
// Strsm solves one of the matrix equations
//
//	A * X = alpha * B   if tA == blas.NoTrans and side == blas.Left
//	Aᵀ * X = alpha * B  if tA == blas.Trans or blas.ConjTrans, and side == blas.Left
//	X * A = alpha * B   if tA == blas.NoTrans and side == blas.Right
//	X * Aᵀ = alpha * B  if tA == blas.Trans or blas.ConjTrans, and side == blas.Right
//
// where A is an n×n or m×m triangular matrix, X and B are m×n matrices, and alpha is a
// scalar.
//
// At entry to the function, X contains the values of B, and the result is
// stored in-place into X.
//
// The matrices are indexed row-major: element (i, j) of A is a[i*lda+j] and
// element (i, j) of B is b[i*ldb+j]. ul selects which triangle of A is read,
// and when d is blas.Unit the diagonal of A is not read.
//
// Strsm panics if s, ul, tA or d is not one of the values tested below, if m
// or n is negative, if lda or ldb is too small, or if a or b is too short.
//
// No check is made that A is invertible.
//
// Float32 implementations are autogenerated and not directly tested.
func (Implementation) Strsm(s blas.Side, ul blas.Uplo, tA blas.Transpose, d blas.Diag, m, n int, alpha float32, a []float32, lda int, b []float32, ldb int) {
if s != blas.Left && s != blas.Right {
panic(badSide)
}
if ul != blas.Lower && ul != blas.Upper {
panic(badUplo)
}
if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
panic(badTranspose)
}
if d != blas.NonUnit && d != blas.Unit {
panic(badDiag)
}
if m < 0 {
panic(mLT0)
}
if n < 0 {
panic(nLT0)
}
// k is the order of A: m when A is on the left, n when it is on the right.
k := n
if s == blas.Left {
k = m
}
if lda < max(1, k) {
panic(badLdA)
}
if ldb < max(1, n) {
panic(badLdB)
}
// Quick return if possible.
if m == 0 || n == 0 {
return
}
// For zero matrix size the following slice length checks are trivially satisfied.
if len(a) < lda*(k-1)+k {
panic(shortA)
}
if len(b) < ldb*(m-1)+n {
panic(shortB)
}
if alpha == 0 {
// The right-hand side is zero, so every entry of X is set to zero.
for i := 0; i < m; i++ {
btmp := b[i*ldb : i*ldb+n]
for j := range btmp {
btmp[j] = 0
}
}
return
}
nonUnit := d == blas.NonUnit
if s == blas.Left {
if tA == blas.NoTrans {
if ul == blas.Upper {
// A * X = alpha * B with A upper triangular: rows of X are solved from
// the last to the first, each using the rows below it.
for i := m - 1; i >= 0; i-- {
btmp := b[i*ldb : i*ldb+n]
if alpha != 1 {
f32.ScalUnitary(alpha, btmp)
}
for ka, va := range a[i*lda+i+1 : i*lda+m] {
if va != 0 {
// ka indexes the sub-slice; k is the corresponding row of B.
k := ka + i + 1
f32.AxpyUnitary(-va, b[k*ldb:k*ldb+n], btmp)
}
}
if nonUnit {
tmp := 1 / a[i*lda+i]
f32.ScalUnitary(tmp, btmp)
}
}
return
}
// A * X = alpha * B with A lower triangular: rows of X are solved from
// the first to the last, each using the rows above it.
for i := 0; i < m; i++ {
btmp := b[i*ldb : i*ldb+n]
if alpha != 1 {
f32.ScalUnitary(alpha, btmp)
}
for k, va := range a[i*lda : i*lda+i] {
if va != 0 {
f32.AxpyUnitary(-va, b[k*ldb:k*ldb+n], btmp)
}
}
if nonUnit {
tmp := 1 / a[i*lda+i]
f32.ScalUnitary(tmp, btmp)
}
}
return
}
// Cases where a is transposed
if ul == blas.Upper {
// Row k of X is divided by the diagonal, then subtracted (scaled by the
// entries of row k of A) from the later rows, and only then scaled by alpha.
for k := 0; k < m; k++ {
btmpk := b[k*ldb : k*ldb+n]
if nonUnit {
tmp := 1 / a[k*lda+k]
f32.ScalUnitary(tmp, btmpk)
}
for ia, va := range a[k*lda+k+1 : k*lda+m] {
if va != 0 {
// ia indexes the sub-slice; i is the corresponding row of B.
i := ia + k + 1
f32.AxpyUnitary(-va, btmpk, b[i*ldb:i*ldb+n])
}
}
if alpha != 1 {
f32.ScalUnitary(alpha, btmpk)
}
}
return
}
// Transposed lower triangle: the same scheme, walking rows from last to first.
for k := m - 1; k >= 0; k-- {
btmpk := b[k*ldb : k*ldb+n]
if nonUnit {
tmp := 1 / a[k*lda+k]
f32.ScalUnitary(tmp, btmpk)
}
for i, va := range a[k*lda : k*lda+k] {
if va != 0 {
f32.AxpyUnitary(-va, btmpk, b[i*ldb:i*ldb+n])
}
}
if alpha != 1 {
f32.ScalUnitary(alpha, btmpk)
}
}
return
}
// Cases where a is to the right of X.
if tA == blas.NoTrans {
if ul == blas.Upper {
// Each row of X is solved independently, left to right: once entry k is
// final it is eliminated from the entries to its right.
for i := 0; i < m; i++ {
btmp := b[i*ldb : i*ldb+n]
if alpha != 1 {
f32.ScalUnitary(alpha, btmp)
}
for k, vb := range btmp {
if vb == 0 {
continue
}
if nonUnit {
btmp[k] /= a[k*lda+k]
}
f32.AxpyUnitary(-btmp[k], a[k*lda+k+1:k*lda+n], btmp[k+1:n])
}
}
return
}
// Lower triangle: the same scheme, right to left within each row.
for i := 0; i < m; i++ {
btmp := b[i*ldb : i*ldb+n]
if alpha != 1 {
f32.ScalUnitary(alpha, btmp)
}
for k := n - 1; k >= 0; k-- {
if btmp[k] == 0 {
continue
}
if nonUnit {
btmp[k] /= a[k*lda+k]
}
f32.AxpyUnitary(-btmp[k], a[k*lda:k*lda+k], btmp[:k])
}
}
return
}
// Cases where a is transposed.
if ul == blas.Upper {
// Entry j of each row is alpha*b[j] minus the dot product of row j of A
// (right of the diagonal) with the already-solved entries to its right.
for i := 0; i < m; i++ {
btmp := b[i*ldb : i*ldb+n]
for j := n - 1; j >= 0; j-- {
tmp := alpha*btmp[j] - f32.DotUnitary(a[j*lda+j+1:j*lda+n], btmp[j+1:])
if nonUnit {
tmp /= a[j*lda+j]
}
btmp[j] = tmp
}
}
return
}
// Transposed lower triangle: the same scheme, left to right, using the
// already-solved entries to the left.
for i := 0; i < m; i++ {
btmp := b[i*ldb : i*ldb+n]
for j := 0; j < n; j++ {
tmp := alpha*btmp[j] - f32.DotUnitary(a[j*lda:j*lda+j], btmp[:j])
if nonUnit {
tmp /= a[j*lda+j]
}
btmp[j] = tmp
}
}
}
// Ssymm performs one of the matrix-matrix operations
//
//	C = alpha * A * B + beta * C  if side == blas.Left
//	C = alpha * B * A + beta * C  if side == blas.Right
//
// where A is an n×n or m×m symmetric matrix, B and C are m×n matrices, and alpha
// is a scalar.
//
// The matrices are indexed row-major with leading dimensions lda, ldb and
// ldc. Only the triangle of A selected by ul is read; entries of the other
// triangle are obtained by swapping the row and column index. C is updated
// in place.
//
// Ssymm panics if s or ul is not one of the values tested below, if m or n is
// negative, if a leading dimension is too small, or if a, b or c is too short.
//
// Float32 implementations are autogenerated and not directly tested.
func (Implementation) Ssymm(s blas.Side, ul blas.Uplo, m, n int, alpha float32, a []float32, lda int, b []float32, ldb int, beta float32, c []float32, ldc int) {
if s != blas.Right && s != blas.Left {
panic(badSide)
}
if ul != blas.Lower && ul != blas.Upper {
panic(badUplo)
}
if m < 0 {
panic(mLT0)
}
if n < 0 {
panic(nLT0)
}
// k is the order of A: m when A is on the left, n when it is on the right.
k := n
if s == blas.Left {
k = m
}
if lda < max(1, k) {
panic(badLdA)
}
if ldb < max(1, n) {
panic(badLdB)
}
if ldc < max(1, n) {
panic(badLdC)
}
// Quick return if possible.
if m == 0 || n == 0 {
return
}
// For zero matrix size the following slice length checks are trivially satisfied.
if len(a) < lda*(k-1)+k {
panic(shortA)
}
if len(b) < ldb*(m-1)+n {
panic(shortB)
}
if len(c) < ldc*(m-1)+n {
panic(shortC)
}
// Quick return if possible.
if alpha == 0 && beta == 1 {
return
}
if beta == 0 {
// Clear C explicitly rather than relying on the later multiplications by beta.
for i := 0; i < m; i++ {
ctmp := c[i*ldc : i*ldc+n]
for j := range ctmp {
ctmp[j] = 0
}
}
}
if alpha == 0 {
// No product term: C is only scaled by beta (already zeroed above when beta == 0).
if beta != 0 {
for i := 0; i < m; i++ {
ctmp := c[i*ldc : i*ldc+n]
for j := 0; j < n; j++ {
ctmp[j] *= beta
}
}
}
return
}
isUpper := ul == blas.Upper
if s == blas.Left {
// C = alpha*A*B + beta*C, one row of C at a time.
for i := 0; i < m; i++ {
// Diagonal term: scale row i of C by beta and add alpha*A[i][i]*B[i].
atmp := alpha * a[i*lda+i]
btmp := b[i*ldb : i*ldb+n]
ctmp := c[i*ldc : i*ldc+n]
for j, v := range btmp {
ctmp[j] *= beta
ctmp[j] += atmp * v
}
// Terms with k < i: A[i][k] lies below the diagonal, so with upper
// storage its mirror a[k*lda+i] is read instead.
for k := 0; k < i; k++ {
var atmp float32
if isUpper {
atmp = a[k*lda+i]
} else {
atmp = a[i*lda+k]
}
atmp *= alpha
f32.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ctmp)
}
// Terms with k > i: A[i][k] lies above the diagonal, so with lower
// storage its mirror a[k*lda+i] is read instead.
for k := i + 1; k < m; k++ {
var atmp float32
if isUpper {
atmp = a[i*lda+k]
} else {
atmp = a[k*lda+i]
}
atmp *= alpha
f32.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ctmp)
}
}
return
}
// C = alpha*B*A + beta*C. Each stored off-diagonal entry of A is used twice
// per row: once to update an already beta-scaled entry of C (ctmp) and once
// in the sum tmp2 that goes into C[i][j].
if isUpper {
for i := 0; i < m; i++ {
// j runs downwards so that C[i][j+1:] has already been scaled by beta
// before ctmp is updated.
for j := n - 1; j >= 0; j-- {
tmp := alpha * b[i*ldb+j]
var tmp2 float32
atmp := a[j*lda+j+1 : j*lda+n]
btmp := b[i*ldb+j+1 : i*ldb+n]
ctmp := c[i*ldc+j+1 : i*ldc+n]
for k, v := range atmp {
ctmp[k] += tmp * v
tmp2 += btmp[k] * v
}
c[i*ldc+j] *= beta
c[i*ldc+j] += tmp*a[j*lda+j] + alpha*tmp2
}
}
return
}
for i := 0; i < m; i++ {
// j runs upwards so that C[i][:j] has already been scaled by beta before
// ctmp is updated.
for j := 0; j < n; j++ {
tmp := alpha * b[i*ldb+j]
var tmp2 float32
atmp := a[j*lda : j*lda+j]
btmp := b[i*ldb : i*ldb+j]
ctmp := c[i*ldc : i*ldc+j]
for k, v := range atmp {
ctmp[k] += tmp * v
tmp2 += btmp[k] * v
}
c[i*ldc+j] *= beta
c[i*ldc+j] += tmp*a[j*lda+j] + alpha*tmp2
}
}
}
// Ssyrk performs one of the symmetric rank-k operations
//
//	C = alpha * A * Aᵀ + beta * C  if tA == blas.NoTrans
//	C = alpha * Aᵀ * A + beta * C  if tA == blas.Trans or tA == blas.ConjTrans
//
// where A is an n×k or k×n matrix, C is an n×n symmetric matrix, and alpha and
// beta are scalars.
//
// Only the triangle of C selected by ul is read and written. When beta is zero
// that triangle is overwritten rather than scaled, so values already stored in
// it (including NaN and Inf) do not influence the result.
//
// Ssyrk panics if ul or tA is not a recognised value, if n or k is negative,
// if lda or ldc is too small, or if a or c is too short for the matrices
// described by the other arguments.
//
// Float32 implementations are autogenerated and not directly tested.
func (Implementation) Ssyrk(ul blas.Uplo, tA blas.Transpose, n, k int, alpha float32, a []float32, lda int, beta float32, c []float32, ldc int) {
	if ul != blas.Lower && ul != blas.Upper {
		panic(badUplo)
	}
	if tA != blas.Trans && tA != blas.NoTrans && tA != blas.ConjTrans {
		panic(badTranspose)
	}
	if n < 0 {
		panic(nLT0)
	}
	if k < 0 {
		panic(kLT0)
	}
	// A is stored as n×k when it is not transposed and as k×n otherwise.
	row, col := k, n
	if tA == blas.NoTrans {
		row, col = n, k
	}
	if lda < max(1, col) {
		panic(badLdA)
	}
	if ldc < max(1, n) {
		panic(badLdC)
	}

	// Quick return if possible.
	if n == 0 {
		return
	}

	// For zero matrix size the following slice length checks are trivially satisfied.
	if len(a) < lda*(row-1)+col {
		panic(shortA)
	}
	if len(c) < ldc*(n-1)+n {
		panic(shortC)
	}

	upper := ul == blas.Upper

	if alpha == 0 {
		// Only the beta * C term remains.
		for i := 0; i < n; i++ {
			ctmp := c[i*ldc : i*ldc+i+1]
			if upper {
				ctmp = c[i*ldc+i : i*ldc+n]
			}
			if beta == 0 {
				for j := range ctmp {
					ctmp[j] = 0
				}
			} else {
				for j := range ctmp {
					ctmp[j] *= beta
				}
			}
		}
		return
	}

	if tA == blas.NoTrans {
		// C[i][j] is the dot product of rows i and j of A.
		if upper {
			for i := 0; i < n; i++ {
				ctmp := c[i*ldc+i : i*ldc+n]
				atmp := a[i*lda : i*lda+k]
				if beta == 0 {
					for jc := range ctmp {
						j := jc + i
						ctmp[jc] = alpha * f32.DotUnitary(atmp, a[j*lda:j*lda+k])
					}
				} else {
					for jc, vc := range ctmp {
						j := jc + i
						ctmp[jc] = vc*beta + alpha*f32.DotUnitary(atmp, a[j*lda:j*lda+k])
					}
				}
			}
			return
		}
		for i := 0; i < n; i++ {
			ctmp := c[i*ldc : i*ldc+i+1]
			atmp := a[i*lda : i*lda+k]
			if beta == 0 {
				for j := range ctmp {
					ctmp[j] = alpha * f32.DotUnitary(a[j*lda:j*lda+k], atmp)
				}
			} else {
				for j, vc := range ctmp {
					ctmp[j] = vc*beta + alpha*f32.DotUnitary(a[j*lda:j*lda+k], atmp)
				}
			}
		}
		return
	}

	// Cases where a is transposed: row i of the referenced triangle of C
	// accumulates alpha * A[l][i] times the matching part of row l of A.
	for i := 0; i < n; i++ {
		// Row i of the referenced triangle spans columns [first, last).
		first, last := 0, i+1
		if upper {
			first, last = i, n
		}
		ctmp := c[i*ldc+first : i*ldc+last]
		if beta == 0 {
			// Zero explicitly: multiplying by zero would keep any NaN or
			// Inf already stored in C, unlike every other code path.
			for j := range ctmp {
				ctmp[j] = 0
			}
		} else if beta != 1 {
			for j := range ctmp {
				ctmp[j] *= beta
			}
		}
		for l := 0; l < k; l++ {
			tmp := alpha * a[l*lda+i]
			if tmp != 0 {
				f32.AxpyUnitary(tmp, a[l*lda+first:l*lda+last], ctmp)
			}
		}
	}
}
// Ssyr2k computes a symmetric rank-2k update of C:
//
//	C = alpha * A * Bᵀ + alpha * B * Aᵀ + beta * C  if tA == blas.NoTrans
//	C = alpha * Aᵀ * B + alpha * Bᵀ * A + beta * C  if tA == blas.Trans or tA == blas.ConjTrans
//
// A and B are n×k matrices in the first case and k×n matrices in the second,
// C is an n×n symmetric matrix, and alpha and beta are scalars. Only the
// triangle of C selected by ul is read and written.
//
// Float32 implementations are autogenerated and not directly tested.
func (Implementation) Ssyr2k(ul blas.Uplo, tA blas.Transpose, n, k int, alpha float32, a []float32, lda int, b []float32, ldb int, beta float32, c []float32, ldc int) {
	switch {
	case ul != blas.Upper && ul != blas.Lower:
		panic(badUplo)
	case tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans:
		panic(badTranspose)
	case n < 0:
		panic(nLT0)
	case k < 0:
		panic(kLT0)
	}

	// Stored shape of A and B.
	rows, cols := n, k
	if tA != blas.NoTrans {
		rows, cols = k, n
	}
	switch {
	case lda < max(1, cols):
		panic(badLdA)
	case ldb < max(1, cols):
		panic(badLdB)
	case ldc < max(1, n):
		panic(badLdC)
	}

	// An empty C leaves nothing to compute.
	if n == 0 {
		return
	}

	switch {
	case len(a) < lda*(rows-1)+cols:
		panic(shortA)
	case len(b) < ldb*(rows-1)+cols:
		panic(shortB)
	case len(c) < ldc*(n-1)+n:
		panic(shortC)
	}

	upper := ul == blas.Upper
	// span returns the half-open column range of row i that lies inside the
	// referenced triangle of C.
	span := func(i int) (from, to int) {
		if upper {
			return i, n
		}
		return 0, i + 1
	}

	if alpha == 0 {
		// Only the beta * C term is left.
		for i := 0; i < n; i++ {
			from, to := span(i)
			crow := c[i*ldc+from : i*ldc+to]
			if beta == 0 {
				for j := range crow {
					crow[j] = 0
				}
			} else {
				for j := range crow {
					crow[j] *= beta
				}
			}
		}
		return
	}

	if tA == blas.NoTrans {
		// C[i][j] combines the dot products of row j of A with row i of B
		// and of row i of A with row j of B.
		for i := 0; i < n; i++ {
			from, to := span(i)
			ai := a[i*lda : i*lda+k]
			bi := b[i*ldb : i*ldb+k]
			crow := c[i*ldc+from : i*ldc+to]
			for off := range crow {
				j := from + off
				bj := b[j*ldb : j*ldb+k]
				var sumAjBi, sumAiBj float32
				for l, av := range a[j*lda : j*lda+k] {
					sumAjBi += av * bi[l]
					sumAiBj += ai[l] * bj[l]
				}
				if beta == 0 {
					// Overwrite so that the old value of C is never read.
					crow[off] = alpha * (sumAjBi + sumAiBj)
					continue
				}
				crow[off] *= beta
				crow[off] += alpha * (sumAjBi + sumAiBj)
			}
		}
		return
	}

	// A and B are transposed: accumulate one stored row of A and B at a time.
	for i := 0; i < n; i++ {
		from, to := span(i)
		crow := c[i*ldc+from : i*ldc+to]
		if beta == 0 {
			for j := range crow {
				crow[j] = 0
			}
		} else if beta != 1 {
			for j := range crow {
				crow[j] *= beta
			}
		}
		for l := 0; l < k; l++ {
			bScaled := alpha * b[l*ldb+i]
			aScaled := alpha * a[l*lda+i]
			bl := b[l*ldb+from : l*ldb+to]
			if bScaled == 0 && aScaled == 0 {
				// Both contributions vanish for this l.
				continue
			}
			for j, av := range a[l*lda+from : l*lda+to] {
				crow[j] += av*bScaled + bl[j]*aScaled
			}
		}
	}
}
// Strmm performs one of the matrix-matrix operations
//
// B = alpha * A * B if tA == blas.NoTrans and side == blas.Left
// B = alpha * Aᵀ * B if tA == blas.Trans or blas.ConjTrans, and side == blas.Left
// B = alpha * B * A if tA == blas.NoTrans and side == blas.Right
// B = alpha * B * Aᵀ if tA == blas.Trans or blas.ConjTrans, and side == blas.Right
//
// where A is an n×n or m×m triangular matrix, B is an m×n matrix, and alpha is a scalar.
//
// A is m×m when s == blas.Left and n×n when s == blas.Right. Only the triangle
// of A selected by ul is read, and the diagonal of A is read only when
// d == blas.NonUnit; otherwise it is taken to be one. The product is written
// in-place into b.
//
// Strmm panics if s, ul, tA or d is not a recognised value, if m or n is
// negative, if lda or ldb is too small, or if a or b is too short.
//
// Float32 implementations are autogenerated and not directly tested.
func (Implementation) Strmm(s blas.Side, ul blas.Uplo, tA blas.Transpose, d blas.Diag, m, n int, alpha float32, a []float32, lda int, b []float32, ldb int) {
if s != blas.Left && s != blas.Right {
panic(badSide)
}
if ul != blas.Lower && ul != blas.Upper {
panic(badUplo)
}
if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
panic(badTranspose)
}
if d != blas.NonUnit && d != blas.Unit {
panic(badDiag)
}
if m < 0 {
panic(mLT0)
}
if n < 0 {
panic(nLT0)
}
// k is the order of A: m when A multiplies from the left, n otherwise.
k := n
if s == blas.Left {
k = m
}
if lda < max(1, k) {
panic(badLdA)
}
if ldb < max(1, n) {
panic(badLdB)
}
// Quick return if possible.
if m == 0 || n == 0 {
return
}
// For zero matrix size the following slice length checks are trivially satisfied.
if len(a) < lda*(k-1)+k {
panic(shortA)
}
if len(b) < ldb*(m-1)+n {
panic(shortB)
}
// With alpha == 0 the result is the zero matrix; A is not read.
if alpha == 0 {
for i := 0; i < m; i++ {
btmp := b[i*ldb : i*ldb+n]
for j := range btmp {
btmp[j] = 0
}
}
return
}
nonUnit := d == blas.NonUnit
if s == blas.Left {
if tA == blas.NoTrans {
if ul == blas.Upper {
// Rows are processed top to bottom: row i only needs rows k > i of
// B, which have not been overwritten yet.
for i := 0; i < m; i++ {
tmp := alpha
if nonUnit {
tmp *= a[i*lda+i]
}
btmp := b[i*ldb : i*ldb+n]
f32.ScalUnitary(tmp, btmp)
for ka, va := range a[i*lda+i+1 : i*lda+m] {
k := ka + i + 1
if va != 0 {
f32.AxpyUnitary(alpha*va, b[k*ldb:k*ldb+n], btmp)
}
}
}
return
}
// Lower triangle: rows are processed bottom to top so that the rows
// k < i read below still hold their original values.
for i := m - 1; i >= 0; i-- {
tmp := alpha
if nonUnit {
tmp *= a[i*lda+i]
}
btmp := b[i*ldb : i*ldb+n]
f32.ScalUnitary(tmp, btmp)
for k, va := range a[i*lda : i*lda+i] {
if va != 0 {
f32.AxpyUnitary(alpha*va, b[k*ldb:k*ldb+n], btmp)
}
}
}
return
}
// Cases where a is transposed.
if ul == blas.Upper {
// Original row k of B is first added into every row i > k, and only
// then scaled by its own diagonal factor.
for k := m - 1; k >= 0; k-- {
btmpk := b[k*ldb : k*ldb+n]
for ia, va := range a[k*lda+k+1 : k*lda+m] {
i := ia + k + 1
btmp := b[i*ldb : i*ldb+n]
if va != 0 {
f32.AxpyUnitary(alpha*va, btmpk, btmp)
}
}
tmp := alpha
if nonUnit {
tmp *= a[k*lda+k]
}
// Scaling by exactly one is skipped.
if tmp != 1 {
f32.ScalUnitary(tmp, btmpk)
}
}
return
}
// Transposed lower triangle: same scheme, with row k added into rows i < k
// and k running upwards.
for k := 0; k < m; k++ {
btmpk := b[k*ldb : k*ldb+n]
for i, va := range a[k*lda : k*lda+k] {
btmp := b[i*ldb : i*ldb+n]
if va != 0 {
f32.AxpyUnitary(alpha*va, btmpk, btmp)
}
}
tmp := alpha
if nonUnit {
tmp *= a[k*lda+k]
}
if tmp != 1 {
f32.ScalUnitary(tmp, btmpk)
}
}
return
}
// Cases where a is on the right
if tA == blas.NoTrans {
if ul == blas.Upper {
// Each row of B is updated independently. Columns are visited from
// last to first, so btmp[k] still holds its original value when read.
for i := 0; i < m; i++ {
btmp := b[i*ldb : i*ldb+n]
for k := n - 1; k >= 0; k-- {
tmp := alpha * btmp[k]
// Entries whose scaled value is zero are skipped and left unchanged.
if tmp == 0 {
continue
}
btmp[k] = tmp
if nonUnit {
btmp[k] *= a[k*lda+k]
}
f32.AxpyUnitary(tmp, a[k*lda+k+1:k*lda+n], btmp[k+1:n])
}
}
return
}
// Lower triangle: columns are visited from first to last and the
// contribution of column k is spread over the columns before it.
for i := 0; i < m; i++ {
btmp := b[i*ldb : i*ldb+n]
for k := 0; k < n; k++ {
tmp := alpha * btmp[k]
if tmp == 0 {
continue
}
btmp[k] = tmp
if nonUnit {
btmp[k] *= a[k*lda+k]
}
f32.AxpyUnitary(tmp, a[k*lda:k*lda+k], btmp[:k])
}
}
return
}
// Cases where a is transposed.
if ul == blas.Upper {
// Entry j is the dot product of row j of A (from the diagonal on) with
// the not yet overwritten tail of the row of B.
for i := 0; i < m; i++ {
btmp := b[i*ldb : i*ldb+n]
for j, vb := range btmp {
tmp := vb
if nonUnit {
tmp *= a[j*lda+j]
}
tmp += f32.DotUnitary(a[j*lda+j+1:j*lda+n], btmp[j+1:n])
btmp[j] = alpha * tmp
}
}
return
}
// Transposed lower triangle: j runs backwards so that btmp[:j] is still
// unmodified when it is read.
for i := 0; i < m; i++ {
btmp := b[i*ldb : i*ldb+n]
for j := n - 1; j >= 0; j-- {
tmp := btmp[j]
if nonUnit {
tmp *= a[j*lda+j]
}
tmp += f32.DotUnitary(a[j*lda:j*lda+j], btmp[:j])
btmp[j] = alpha * tmp
}
}
}

913
vendor/gonum.org/v1/gonum/blas/gonum/level3float64.go generated vendored Normal file
View File

@@ -0,0 +1,913 @@
// Copyright ©2014 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package gonum
import (
"gonum.org/v1/gonum/blas"
"gonum.org/v1/gonum/internal/asm/f64"
)
var _ blas.Float64Level3 = Implementation{}
// Dtrsm solves one of the matrix equations
//
// A * X = alpha * B if tA == blas.NoTrans and side == blas.Left
// Aᵀ * X = alpha * B if tA == blas.Trans or blas.ConjTrans, and side == blas.Left
// X * A = alpha * B if tA == blas.NoTrans and side == blas.Right
// X * Aᵀ = alpha * B if tA == blas.Trans or blas.ConjTrans, and side == blas.Right
//
// where A is an n×n or m×m triangular matrix, X and B are m×n matrices, and alpha is a
// scalar.
//
// At entry to the function, X contains the values of B, and the result is
// stored in-place into X.
//
// A is m×m when s == blas.Left and n×n when s == blas.Right. Only the triangle
// of A selected by ul is read, and the diagonal of A is read only when
// d == blas.NonUnit.
//
// Dtrsm panics if s, ul, tA or d is not a recognised value, if m or n is
// negative, if lda or ldb is too small, or if a or b is too short.
//
// No check is made that A is invertible.
func (Implementation) Dtrsm(s blas.Side, ul blas.Uplo, tA blas.Transpose, d blas.Diag, m, n int, alpha float64, a []float64, lda int, b []float64, ldb int) {
if s != blas.Left && s != blas.Right {
panic(badSide)
}
if ul != blas.Lower && ul != blas.Upper {
panic(badUplo)
}
if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
panic(badTranspose)
}
if d != blas.NonUnit && d != blas.Unit {
panic(badDiag)
}
if m < 0 {
panic(mLT0)
}
if n < 0 {
panic(nLT0)
}
// k is the order of A: m when A is on the left, n otherwise.
k := n
if s == blas.Left {
k = m
}
if lda < max(1, k) {
panic(badLdA)
}
if ldb < max(1, n) {
panic(badLdB)
}
// Quick return if possible.
if m == 0 || n == 0 {
return
}
// For zero matrix size the following slice length checks are trivially satisfied.
if len(a) < lda*(k-1)+k {
panic(shortA)
}
if len(b) < ldb*(m-1)+n {
panic(shortB)
}
// With alpha == 0 the solution is the zero matrix; A is not read.
if alpha == 0 {
for i := 0; i < m; i++ {
btmp := b[i*ldb : i*ldb+n]
for j := range btmp {
btmp[j] = 0
}
}
return
}
nonUnit := d == blas.NonUnit
if s == blas.Left {
if tA == blas.NoTrans {
if ul == blas.Upper {
// Back substitution: rows are solved from the last to the first, so
// the rows k > i subtracted below already hold solution values.
for i := m - 1; i >= 0; i-- {
btmp := b[i*ldb : i*ldb+n]
if alpha != 1 {
f64.ScalUnitary(alpha, btmp)
}
for ka, va := range a[i*lda+i+1 : i*lda+m] {
if va != 0 {
k := ka + i + 1
f64.AxpyUnitary(-va, b[k*ldb:k*ldb+n], btmp)
}
}
if nonUnit {
tmp := 1 / a[i*lda+i]
f64.ScalUnitary(tmp, btmp)
}
}
return
}
// Forward substitution for the lower triangle: rows are solved from the
// first to the last using the already solved rows k < i.
for i := 0; i < m; i++ {
btmp := b[i*ldb : i*ldb+n]
if alpha != 1 {
f64.ScalUnitary(alpha, btmp)
}
for k, va := range a[i*lda : i*lda+i] {
if va != 0 {
f64.AxpyUnitary(-va, b[k*ldb:k*ldb+n], btmp)
}
}
if nonUnit {
tmp := 1 / a[i*lda+i]
f64.ScalUnitary(tmp, btmp)
}
}
return
}
// Cases where a is transposed
if ul == blas.Upper {
// Row k is divided by its diagonal entry, eliminated from every row
// i > k, and only afterwards scaled by alpha.
for k := 0; k < m; k++ {
btmpk := b[k*ldb : k*ldb+n]
if nonUnit {
tmp := 1 / a[k*lda+k]
f64.ScalUnitary(tmp, btmpk)
}
for ia, va := range a[k*lda+k+1 : k*lda+m] {
if va != 0 {
i := ia + k + 1
f64.AxpyUnitary(-va, btmpk, b[i*ldb:i*ldb+n])
}
}
if alpha != 1 {
f64.ScalUnitary(alpha, btmpk)
}
}
return
}
// Transposed lower triangle: same scheme with k running from the last
// row to the first and row k eliminated from the rows i < k.
for k := m - 1; k >= 0; k-- {
btmpk := b[k*ldb : k*ldb+n]
if nonUnit {
tmp := 1 / a[k*lda+k]
f64.ScalUnitary(tmp, btmpk)
}
for i, va := range a[k*lda : k*lda+k] {
if va != 0 {
f64.AxpyUnitary(-va, btmpk, b[i*ldb:i*ldb+n])
}
}
if alpha != 1 {
f64.ScalUnitary(alpha, btmpk)
}
}
return
}
// Cases where a is to the right of X.
if tA == blas.NoTrans {
if ul == blas.Upper {
// Each row of B is solved independently, column by column from the
// first to the last.
for i := 0; i < m; i++ {
btmp := b[i*ldb : i*ldb+n]
if alpha != 1 {
f64.ScalUnitary(alpha, btmp)
}
for k, vb := range btmp {
// A zero entry contributes nothing to the later columns.
if vb == 0 {
continue
}
if nonUnit {
btmp[k] /= a[k*lda+k]
}
f64.AxpyUnitary(-btmp[k], a[k*lda+k+1:k*lda+n], btmp[k+1:n])
}
}
return
}
// Lower triangle: columns are solved from the last to the first and the
// solved entry is eliminated from the columns before it.
for i := 0; i < m; i++ {
btmp := b[i*ldb : i*ldb+n]
if alpha != 1 {
f64.ScalUnitary(alpha, btmp)
}
for k := n - 1; k >= 0; k-- {
if btmp[k] == 0 {
continue
}
if nonUnit {
btmp[k] /= a[k*lda+k]
}
f64.AxpyUnitary(-btmp[k], a[k*lda:k*lda+k], btmp[:k])
}
}
return
}
// Cases where a is transposed.
if ul == blas.Upper {
// Entry j is alpha * B[i][j] minus the dot product of the strictly
// upper part of row j of A with the already solved entries after j.
for i := 0; i < m; i++ {
btmp := b[i*ldb : i*ldb+n]
for j := n - 1; j >= 0; j-- {
tmp := alpha*btmp[j] - f64.DotUnitary(a[j*lda+j+1:j*lda+n], btmp[j+1:])
if nonUnit {
tmp /= a[j*lda+j]
}
btmp[j] = tmp
}
}
return
}
// Transposed lower triangle: j runs forwards and uses the already solved
// entries before j.
for i := 0; i < m; i++ {
btmp := b[i*ldb : i*ldb+n]
for j := 0; j < n; j++ {
tmp := alpha*btmp[j] - f64.DotUnitary(a[j*lda:j*lda+j], btmp[:j])
if nonUnit {
tmp /= a[j*lda+j]
}
btmp[j] = tmp
}
}
}
// Dsymm computes a symmetric matrix-matrix product and accumulates it into C:
//
//	C = alpha * A * B + beta * C  if side == blas.Left
//	C = alpha * B * A + beta * C  if side == blas.Right
//
// A is a symmetric matrix of order m (blas.Left) or n (blas.Right), B and C
// are m×n matrices, and alpha and beta are scalars. Only the triangle of A
// selected by ul is read.
func (Implementation) Dsymm(s blas.Side, ul blas.Uplo, m, n int, alpha float64, a []float64, lda int, b []float64, ldb int, beta float64, c []float64, ldc int) {
	switch {
	case s != blas.Left && s != blas.Right:
		panic(badSide)
	case ul != blas.Upper && ul != blas.Lower:
		panic(badUplo)
	case m < 0:
		panic(mLT0)
	case n < 0:
		panic(nLT0)
	}

	// A is m×m when applied from the left and n×n when applied from the right.
	order := n
	if s == blas.Left {
		order = m
	}
	switch {
	case lda < max(1, order):
		panic(badLdA)
	case ldb < max(1, n):
		panic(badLdB)
	case ldc < max(1, n):
		panic(badLdC)
	}

	// An empty C leaves nothing to compute.
	if m == 0 || n == 0 {
		return
	}

	// Each slice has to be long enough to hold the matrix it describes.
	switch {
	case len(a) < lda*(order-1)+order:
		panic(shortA)
	case len(b) < ldb*(m-1)+n:
		panic(shortB)
	case len(c) < ldc*(m-1)+n:
		panic(shortC)
	}

	// With these scalars C is returned unchanged.
	if alpha == 0 && beta == 1 {
		return
	}

	if beta == 0 {
		// Clear C explicitly instead of multiplying by zero.
		for i := 0; i < m; i++ {
			crow := c[i*ldc : i*ldc+n]
			for j := range crow {
				crow[j] = 0
			}
		}
	}
	if alpha == 0 {
		// Only the beta * C term is left.
		if beta != 0 {
			for i := 0; i < m; i++ {
				crow := c[i*ldc : i*ldc+n]
				for j := range crow {
					crow[j] *= beta
				}
			}
		}
		return
	}

	upper := ul == blas.Upper
	if s == blas.Left {
		for i := 0; i < m; i++ {
			diag := alpha * a[i*lda+i]
			crow := c[i*ldc : i*ldc+n]
			// Scale row i of C by beta and add the diagonal contribution.
			for j, bv := range b[i*ldb : i*ldb+n] {
				crow[j] *= beta
				crow[j] += diag * bv
			}
			// Add the off-diagonal contributions in increasing k, reading
			// A[i][k] from whichever triangle is stored.
			for k := 0; k < m; k++ {
				if k == i {
					continue
				}
				lo, hi := k, i
				if k > i {
					lo, hi = i, k
				}
				var coef float64
				if upper {
					coef = a[lo*lda+hi]
				} else {
					coef = a[hi*lda+lo]
				}
				coef *= alpha
				f64.AxpyUnitary(coef, b[k*ldb:k*ldb+n], crow)
			}
		}
		return
	}

	// A multiplies from the right: each row of C is updated independently.
	for i := 0; i < m; i++ {
		brow := b[i*ldb : i*ldb+n]
		crow := c[i*ldc : i*ldc+n]
		if upper {
			// Walk the columns backwards so that the entries of C to the
			// right of j have already been scaled by beta when they are
			// accumulated into.
			for j := n - 1; j >= 0; j-- {
				scaled := alpha * brow[j]
				var dot float64
				btail := brow[j+1:]
				ctail := crow[j+1:]
				for k, av := range a[j*lda+j+1 : j*lda+n] {
					ctail[k] += scaled * av
					dot += btail[k] * av
				}
				crow[j] *= beta
				crow[j] += scaled*a[j*lda+j] + alpha*dot
			}
			continue
		}
		// Lower triangle: walk the columns forwards for the same reason.
		for j := 0; j < n; j++ {
			scaled := alpha * brow[j]
			var dot float64
			for k, av := range a[j*lda : j*lda+j] {
				crow[k] += scaled * av
				dot += brow[k] * av
			}
			crow[j] *= beta
			crow[j] += scaled*a[j*lda+j] + alpha*dot
		}
	}
}
// Dsyrk performs one of the symmetric rank-k operations
//
//	C = alpha * A * Aᵀ + beta * C  if tA == blas.NoTrans
//	C = alpha * Aᵀ * A + beta * C  if tA == blas.Trans or tA == blas.ConjTrans
//
// where A is an n×k or k×n matrix, C is an n×n symmetric matrix, and alpha and
// beta are scalars.
//
// Only the triangle of C selected by ul is read and written. When beta is zero
// that triangle is overwritten rather than scaled, so values already stored in
// it (including NaN and Inf) do not influence the result.
//
// Dsyrk panics if ul or tA is not a recognised value, if n or k is negative,
// if lda or ldc is too small, or if a or c is too short for the matrices
// described by the other arguments.
func (Implementation) Dsyrk(ul blas.Uplo, tA blas.Transpose, n, k int, alpha float64, a []float64, lda int, beta float64, c []float64, ldc int) {
	if ul != blas.Lower && ul != blas.Upper {
		panic(badUplo)
	}
	if tA != blas.Trans && tA != blas.NoTrans && tA != blas.ConjTrans {
		panic(badTranspose)
	}
	if n < 0 {
		panic(nLT0)
	}
	if k < 0 {
		panic(kLT0)
	}
	// A is stored as n×k when it is not transposed and as k×n otherwise.
	row, col := k, n
	if tA == blas.NoTrans {
		row, col = n, k
	}
	if lda < max(1, col) {
		panic(badLdA)
	}
	if ldc < max(1, n) {
		panic(badLdC)
	}

	// Quick return if possible.
	if n == 0 {
		return
	}

	// For zero matrix size the following slice length checks are trivially satisfied.
	if len(a) < lda*(row-1)+col {
		panic(shortA)
	}
	if len(c) < ldc*(n-1)+n {
		panic(shortC)
	}

	upper := ul == blas.Upper

	if alpha == 0 {
		// Only the beta * C term remains.
		for i := 0; i < n; i++ {
			ctmp := c[i*ldc : i*ldc+i+1]
			if upper {
				ctmp = c[i*ldc+i : i*ldc+n]
			}
			if beta == 0 {
				for j := range ctmp {
					ctmp[j] = 0
				}
			} else {
				for j := range ctmp {
					ctmp[j] *= beta
				}
			}
		}
		return
	}

	if tA == blas.NoTrans {
		// C[i][j] is the dot product of rows i and j of A.
		if upper {
			for i := 0; i < n; i++ {
				ctmp := c[i*ldc+i : i*ldc+n]
				atmp := a[i*lda : i*lda+k]
				if beta == 0 {
					for jc := range ctmp {
						j := jc + i
						ctmp[jc] = alpha * f64.DotUnitary(atmp, a[j*lda:j*lda+k])
					}
				} else {
					for jc, vc := range ctmp {
						j := jc + i
						ctmp[jc] = vc*beta + alpha*f64.DotUnitary(atmp, a[j*lda:j*lda+k])
					}
				}
			}
			return
		}
		for i := 0; i < n; i++ {
			ctmp := c[i*ldc : i*ldc+i+1]
			atmp := a[i*lda : i*lda+k]
			if beta == 0 {
				for j := range ctmp {
					ctmp[j] = alpha * f64.DotUnitary(a[j*lda:j*lda+k], atmp)
				}
			} else {
				for j, vc := range ctmp {
					ctmp[j] = vc*beta + alpha*f64.DotUnitary(a[j*lda:j*lda+k], atmp)
				}
			}
		}
		return
	}

	// Cases where a is transposed: row i of the referenced triangle of C
	// accumulates alpha * A[l][i] times the matching part of row l of A.
	for i := 0; i < n; i++ {
		// Row i of the referenced triangle spans columns [first, last).
		first, last := 0, i+1
		if upper {
			first, last = i, n
		}
		ctmp := c[i*ldc+first : i*ldc+last]
		if beta == 0 {
			// Zero explicitly: multiplying by zero would keep any NaN or
			// Inf already stored in C, unlike every other code path.
			for j := range ctmp {
				ctmp[j] = 0
			}
		} else if beta != 1 {
			for j := range ctmp {
				ctmp[j] *= beta
			}
		}
		for l := 0; l < k; l++ {
			tmp := alpha * a[l*lda+i]
			if tmp != 0 {
				f64.AxpyUnitary(tmp, a[l*lda+first:l*lda+last], ctmp)
			}
		}
	}
}
// Dsyr2k computes a symmetric rank-2k update of C:
//
//	C = alpha * A * Bᵀ + alpha * B * Aᵀ + beta * C  if tA == blas.NoTrans
//	C = alpha * Aᵀ * B + alpha * Bᵀ * A + beta * C  if tA == blas.Trans or tA == blas.ConjTrans
//
// A and B are n×k matrices in the first case and k×n matrices in the second,
// C is an n×n symmetric matrix, and alpha and beta are scalars. Only the
// triangle of C selected by ul is read and written.
func (Implementation) Dsyr2k(ul blas.Uplo, tA blas.Transpose, n, k int, alpha float64, a []float64, lda int, b []float64, ldb int, beta float64, c []float64, ldc int) {
	switch {
	case ul != blas.Upper && ul != blas.Lower:
		panic(badUplo)
	case tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans:
		panic(badTranspose)
	case n < 0:
		panic(nLT0)
	case k < 0:
		panic(kLT0)
	}

	// Stored shape of A and B.
	rows, cols := n, k
	if tA != blas.NoTrans {
		rows, cols = k, n
	}
	switch {
	case lda < max(1, cols):
		panic(badLdA)
	case ldb < max(1, cols):
		panic(badLdB)
	case ldc < max(1, n):
		panic(badLdC)
	}

	// An empty C leaves nothing to compute.
	if n == 0 {
		return
	}

	switch {
	case len(a) < lda*(rows-1)+cols:
		panic(shortA)
	case len(b) < ldb*(rows-1)+cols:
		panic(shortB)
	case len(c) < ldc*(n-1)+n:
		panic(shortC)
	}

	upper := ul == blas.Upper
	// span returns the half-open column range of row i that lies inside the
	// referenced triangle of C.
	span := func(i int) (from, to int) {
		if upper {
			return i, n
		}
		return 0, i + 1
	}

	if alpha == 0 {
		// Only the beta * C term is left.
		for i := 0; i < n; i++ {
			from, to := span(i)
			crow := c[i*ldc+from : i*ldc+to]
			if beta == 0 {
				for j := range crow {
					crow[j] = 0
				}
			} else {
				for j := range crow {
					crow[j] *= beta
				}
			}
		}
		return
	}

	if tA == blas.NoTrans {
		// C[i][j] combines the dot products of row j of A with row i of B
		// and of row i of A with row j of B.
		for i := 0; i < n; i++ {
			from, to := span(i)
			ai := a[i*lda : i*lda+k]
			bi := b[i*ldb : i*ldb+k]
			crow := c[i*ldc+from : i*ldc+to]
			for off := range crow {
				j := from + off
				bj := b[j*ldb : j*ldb+k]
				var sumAjBi, sumAiBj float64
				for l, av := range a[j*lda : j*lda+k] {
					sumAjBi += av * bi[l]
					sumAiBj += ai[l] * bj[l]
				}
				if beta == 0 {
					// Overwrite so that the old value of C is never read.
					crow[off] = alpha * (sumAjBi + sumAiBj)
					continue
				}
				crow[off] *= beta
				crow[off] += alpha * (sumAjBi + sumAiBj)
			}
		}
		return
	}

	// A and B are transposed: accumulate one stored row of A and B at a time.
	for i := 0; i < n; i++ {
		from, to := span(i)
		crow := c[i*ldc+from : i*ldc+to]
		if beta == 0 {
			for j := range crow {
				crow[j] = 0
			}
		} else if beta != 1 {
			for j := range crow {
				crow[j] *= beta
			}
		}
		for l := 0; l < k; l++ {
			bScaled := alpha * b[l*ldb+i]
			aScaled := alpha * a[l*lda+i]
			bl := b[l*ldb+from : l*ldb+to]
			if bScaled == 0 && aScaled == 0 {
				// Both contributions vanish for this l.
				continue
			}
			for j, av := range a[l*lda+from : l*lda+to] {
				crow[j] += av*bScaled + bl[j]*aScaled
			}
		}
	}
}
// Dtrmm performs one of the matrix-matrix operations
//
// B = alpha * A * B if tA == blas.NoTrans and side == blas.Left
// B = alpha * Aᵀ * B if tA == blas.Trans or blas.ConjTrans, and side == blas.Left
// B = alpha * B * A if tA == blas.NoTrans and side == blas.Right
// B = alpha * B * Aᵀ if tA == blas.Trans or blas.ConjTrans, and side == blas.Right
//
// where A is an n×n or m×m triangular matrix, B is an m×n matrix, and alpha is a scalar.
//
// A is m×m when s == blas.Left and n×n when s == blas.Right. Only the triangle
// of A selected by ul is read, and the diagonal of A is read only when
// d == blas.NonUnit; otherwise it is taken to be one. The product is written
// in-place into b.
//
// Dtrmm panics if s, ul, tA or d is not a recognised value, if m or n is
// negative, if lda or ldb is too small, or if a or b is too short.
func (Implementation) Dtrmm(s blas.Side, ul blas.Uplo, tA blas.Transpose, d blas.Diag, m, n int, alpha float64, a []float64, lda int, b []float64, ldb int) {
if s != blas.Left && s != blas.Right {
panic(badSide)
}
if ul != blas.Lower && ul != blas.Upper {
panic(badUplo)
}
if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
panic(badTranspose)
}
if d != blas.NonUnit && d != blas.Unit {
panic(badDiag)
}
if m < 0 {
panic(mLT0)
}
if n < 0 {
panic(nLT0)
}
// k is the order of A: m when A multiplies from the left, n otherwise.
k := n
if s == blas.Left {
k = m
}
if lda < max(1, k) {
panic(badLdA)
}
if ldb < max(1, n) {
panic(badLdB)
}
// Quick return if possible.
if m == 0 || n == 0 {
return
}
// For zero matrix size the following slice length checks are trivially satisfied.
if len(a) < lda*(k-1)+k {
panic(shortA)
}
if len(b) < ldb*(m-1)+n {
panic(shortB)
}
// With alpha == 0 the result is the zero matrix; A is not read.
if alpha == 0 {
for i := 0; i < m; i++ {
btmp := b[i*ldb : i*ldb+n]
for j := range btmp {
btmp[j] = 0
}
}
return
}
nonUnit := d == blas.NonUnit
if s == blas.Left {
if tA == blas.NoTrans {
if ul == blas.Upper {
// Rows are processed top to bottom: row i only needs rows k > i of
// B, which have not been overwritten yet.
for i := 0; i < m; i++ {
tmp := alpha
if nonUnit {
tmp *= a[i*lda+i]
}
btmp := b[i*ldb : i*ldb+n]
f64.ScalUnitary(tmp, btmp)
for ka, va := range a[i*lda+i+1 : i*lda+m] {
k := ka + i + 1
if va != 0 {
f64.AxpyUnitary(alpha*va, b[k*ldb:k*ldb+n], btmp)
}
}
}
return
}
// Lower triangle: rows are processed bottom to top so that the rows
// k < i read below still hold their original values.
for i := m - 1; i >= 0; i-- {
tmp := alpha
if nonUnit {
tmp *= a[i*lda+i]
}
btmp := b[i*ldb : i*ldb+n]
f64.ScalUnitary(tmp, btmp)
for k, va := range a[i*lda : i*lda+i] {
if va != 0 {
f64.AxpyUnitary(alpha*va, b[k*ldb:k*ldb+n], btmp)
}
}
}
return
}
// Cases where a is transposed.
if ul == blas.Upper {
// Original row k of B is first added into every row i > k, and only
// then scaled by its own diagonal factor.
for k := m - 1; k >= 0; k-- {
btmpk := b[k*ldb : k*ldb+n]
for ia, va := range a[k*lda+k+1 : k*lda+m] {
i := ia + k + 1
btmp := b[i*ldb : i*ldb+n]
if va != 0 {
f64.AxpyUnitary(alpha*va, btmpk, btmp)
}
}
tmp := alpha
if nonUnit {
tmp *= a[k*lda+k]
}
// Scaling by exactly one is skipped.
if tmp != 1 {
f64.ScalUnitary(tmp, btmpk)
}
}
return
}
// Transposed lower triangle: same scheme, with row k added into rows i < k
// and k running upwards.
for k := 0; k < m; k++ {
btmpk := b[k*ldb : k*ldb+n]
for i, va := range a[k*lda : k*lda+k] {
btmp := b[i*ldb : i*ldb+n]
if va != 0 {
f64.AxpyUnitary(alpha*va, btmpk, btmp)
}
}
tmp := alpha
if nonUnit {
tmp *= a[k*lda+k]
}
if tmp != 1 {
f64.ScalUnitary(tmp, btmpk)
}
}
return
}
// Cases where a is on the right
if tA == blas.NoTrans {
if ul == blas.Upper {
// Each row of B is updated independently. Columns are visited from
// last to first, so btmp[k] still holds its original value when read.
for i := 0; i < m; i++ {
btmp := b[i*ldb : i*ldb+n]
for k := n - 1; k >= 0; k-- {
tmp := alpha * btmp[k]
// Entries whose scaled value is zero are skipped and left unchanged.
if tmp == 0 {
continue
}
btmp[k] = tmp
if nonUnit {
btmp[k] *= a[k*lda+k]
}
f64.AxpyUnitary(tmp, a[k*lda+k+1:k*lda+n], btmp[k+1:n])
}
}
return
}
// Lower triangle: columns are visited from first to last and the
// contribution of column k is spread over the columns before it.
for i := 0; i < m; i++ {
btmp := b[i*ldb : i*ldb+n]
for k := 0; k < n; k++ {
tmp := alpha * btmp[k]
if tmp == 0 {
continue
}
btmp[k] = tmp
if nonUnit {
btmp[k] *= a[k*lda+k]
}
f64.AxpyUnitary(tmp, a[k*lda:k*lda+k], btmp[:k])
}
}
return
}
// Cases where a is transposed.
if ul == blas.Upper {
// Entry j is the dot product of row j of A (from the diagonal on) with
// the not yet overwritten tail of the row of B.
for i := 0; i < m; i++ {
btmp := b[i*ldb : i*ldb+n]
for j, vb := range btmp {
tmp := vb
if nonUnit {
tmp *= a[j*lda+j]
}
tmp += f64.DotUnitary(a[j*lda+j+1:j*lda+n], btmp[j+1:n])
btmp[j] = alpha * tmp
}
}
return
}
// Transposed lower triangle: j runs backwards so that btmp[:j] is still
// unmodified when it is read.
for i := 0; i < m; i++ {
btmp := b[i*ldb : i*ldb+n]
for j := n - 1; j >= 0; j-- {
tmp := btmp[j]
if nonUnit {
tmp *= a[j*lda+j]
}
tmp += f64.DotUnitary(a[j*lda:j*lda+j], btmp[:j])
btmp[j] = alpha * tmp
}
}
}

301
vendor/gonum.org/v1/gonum/blas/gonum/sgemm.go generated vendored Normal file
View File

@@ -0,0 +1,301 @@
// Code generated by "go generate gonum.org/v1/gonum/blas/gonum"; DO NOT EDIT.
// Copyright ©2014 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package gonum
import (
"runtime"
"sync"
"gonum.org/v1/gonum/blas"
"gonum.org/v1/gonum/internal/asm/f32"
)
// Sgemm computes a general matrix-matrix product and accumulates it into C:
//
//	C = alpha * A * B + beta * C
//	C = alpha * Aᵀ * B + beta * C
//	C = alpha * A * Bᵀ + beta * C
//	C = alpha * Aᵀ * Bᵀ + beta * C
//
// A is an m×k or k×m dense matrix, B is an n×k or k×n dense matrix, C is an
// m×n matrix, and alpha and beta are scalars. tA and tB select whether A and
// B are used transposed.
//
// Float32 implementations are autogenerated and not directly tested.
func (Implementation) Sgemm(tA, tB blas.Transpose, m, n, k int, alpha float32, a []float32, lda int, b []float32, ldb int, beta float32, c []float32, ldc int) {
	if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
		panic(badTranspose)
	}
	if tB != blas.NoTrans && tB != blas.Trans && tB != blas.ConjTrans {
		panic(badTranspose)
	}
	switch {
	case m < 0:
		panic(mLT0)
	case n < 0:
		panic(nLT0)
	case k < 0:
		panic(kLT0)
	}

	// Stored shapes: A is m×k and B is k×n unless the operand is transposed,
	// in which case the two dimensions swap.
	aTrans := tA != blas.NoTrans
	aRows, aCols := m, k
	if aTrans {
		aRows, aCols = k, m
	}
	if lda < max(1, aCols) {
		panic(badLdA)
	}
	bTrans := tB != blas.NoTrans
	bRows, bCols := k, n
	if bTrans {
		bRows, bCols = n, k
	}
	if ldb < max(1, bCols) {
		panic(badLdB)
	}
	if ldc < max(1, n) {
		panic(badLdC)
	}

	// An empty C leaves nothing to compute.
	if m == 0 || n == 0 {
		return
	}

	// Each slice has to be long enough to hold the matrix it describes.
	switch {
	case len(a) < (aRows-1)*lda+aCols:
		panic(shortA)
	case len(b) < (bRows-1)*ldb+bCols:
		panic(shortB)
	case len(c) < (m-1)*ldc+n:
		panic(shortC)
	}

	// The product term vanishes and C keeps its value.
	if (alpha == 0 || k == 0) && beta == 1 {
		return
	}

	// Apply beta to C up front; the multiplication below only accumulates.
	switch {
	case beta == 0:
		for i := 0; i < m; i++ {
			crow := c[i*ldc : i*ldc+n]
			for j := range crow {
				crow[j] = 0
			}
		}
	case beta != 1:
		for i := 0; i < m; i++ {
			crow := c[i*ldc : i*ldc+n]
			for j := range crow {
				crow[j] *= beta
			}
		}
	}

	sgemmParallel(aTrans, bTrans, m, n, k, a, lda, b, ldb, c, ldc, alpha)
}
// sgemmParallel accumulates the product of a and b, optionally transposed as
// selected by aTrans and bTrans, into c. It hands every sub-block to
// sgemmSerial together with alpha and does not scale c itself.
func sgemmParallel(aTrans, bTrans bool, m, n, k int, a []float32, lda int, b []float32, ldb int, c []float32, ldc int, alpha float32) {
// sgemmParallel computes a parallel matrix multiplication by partitioning
// a and b into sub-blocks, and updating c with the multiplication of the sub-block
// In all cases,
// A = [ A_11 A_12 ... A_1j
// A_21 A_22 ... A_2j
// ...
// A_i1 A_i2 ... A_ij]
//
// and same for B. All of the submatrix sizes are blockSize×blockSize except
// at the edges.
//
// In all cases, there is one dimension for each matrix along which
// C must be updated sequentially.
// Cij = \sum_k Aik Bki, (A * B)
// Cij = \sum_k Aki Bkj, (Aᵀ * B)
// Cij = \sum_k Aik Bjk, (A * Bᵀ)
// Cij = \sum_k Aki Bjk, (Aᵀ * Bᵀ)
//
// This code computes one {i, j} block sequentially along the k dimension,
// and computes all of the {i, j} blocks concurrently. This
// partitioning allows Cij to be updated in-place without race-conditions.
// One goroutine is started per {i, j} block, and a buffered channel is
// used as a semaphore to bound how many of them run at the same time.
//
// http://alexkr.com/docs/matrixmult.pdf is a good reference on matrix-matrix
// multiplies, though this code does not copy matrices to attempt to eliminate
// cache misses.
// Keep the full inner dimension under its own name; k is shadowed by the
// block loop inside the goroutine below.
maxKLen := k
// NOTE(review): blocks, blockSize and minParBlock are defined elsewhere in
// the package; blocks(d, blockSize) is presumably the number of blocks
// needed to cover a dimension of size d — confirm.
parBlocks := blocks(m, blockSize) * blocks(n, blockSize)
if parBlocks < minParBlock {
// The matrix multiplication is small in the dimensions where it can be
// computed concurrently. Just do it in serial.
sgemmSerial(aTrans, bTrans, m, n, k, a, lda, b, ldb, c, ldc, alpha)
return
}
// workerLimit acts as a limit on the number of concurrently running
// workers, with the limit set to the number of procs available.
workerLimit := make(chan struct{}, runtime.GOMAXPROCS(0))
// wg is used to wait for all of the block computations to finish before
// sgemmParallel returns.
var wg sync.WaitGroup
wg.Add(parBlocks)
defer wg.Wait()
for i := 0; i < m; i += blockSize {
for j := 0; j < n; j += blockSize {
// Blocks here until a worker slot is free.
workerLimit <- struct{}{}
go func(i, j int) {
defer func() {
wg.Done()
// Release the worker slot.
<-workerLimit
}()
// Clip the block at the bottom and right edges of c.
leni := blockSize
if i+leni > m {
leni = m - i
}
lenj := blockSize
if j+lenj > n {
lenj = n - j
}
// NOTE(review): sliceView32 is defined elsewhere; from its use here it
// appears to return the sub-slice starting at element (i, j) that covers
// a leni×lenj block with the given stride — confirm.
cSub := sliceView32(c, ldc, i, j, leni, lenj)
// Compute A_ik B_kj for all k
for k := 0; k < maxKLen; k += blockSize {
lenk := blockSize
if k+lenk > maxKLen {
lenk = maxKLen - k
}
var aSub, bSub []float32
// For a transposed operand the block is addressed with its row and
// column indices swapped.
if aTrans {
aSub = sliceView32(a, lda, k, i, lenk, leni)
} else {
aSub = sliceView32(a, lda, i, k, leni, lenk)
}
if bTrans {
bSub = sliceView32(b, ldb, j, k, lenj, lenk)
} else {
bSub = sliceView32(b, ldb, k, j, lenk, lenj)
}
sgemmSerial(aTrans, bTrans, leni, lenj, lenk, aSub, lda, bSub, ldb, cSub, ldc, alpha)
}
}(i, j)
}
}
}
// sgemmSerial is serial matrix multiply
func sgemmSerial(aTrans, bTrans bool, m, n, k int, a []float32, lda int, b []float32, ldb int, c []float32, ldc int, alpha float32) {
switch {
case !aTrans && !bTrans:
sgemmSerialNotNot(m, n, k, a, lda, b, ldb, c, ldc, alpha)
return
case aTrans && !bTrans:
sgemmSerialTransNot(m, n, k, a, lda, b, ldb, c, ldc, alpha)
return
case !aTrans && bTrans:
sgemmSerialNotTrans(m, n, k, a, lda, b, ldb, c, ldc, alpha)
return
case aTrans && bTrans:
sgemmSerialTransTrans(m, n, k, a, lda, b, ldb, c, ldc, alpha)
return
default:
panic("unreachable")
}
}
// sgemmSerialNotNot is the sgemmSerial kernel for the case where neither a
// nor b is transposed. Row i of c is updated from row i of a (k entries) and
// the first k rows of b (n entries each).
func sgemmSerialNotNot(m, n, k int, a []float32, lda int, b []float32, ldb int, c []float32, ldc int, alpha float32) {
	// Rows are re-sliced once and handed to the unitary kernels rather than
	// indexing with [i*stride+j]; the original authors measured this style
	// as approximately 5 times faster as of go 1.3.
	for row := 0; row < m; row++ {
		cRow := c[row*ldc : row*ldc+n]
		aRow := a[row*lda : row*lda+k]
		for p, aVal := range aRow {
			scale := alpha * aVal
			if scale == 0 {
				// A zero coefficient contributes nothing to this row.
				continue
			}
			f32.AxpyUnitary(scale, b[p*ldb:p*ldb+n], cRow)
		}
	}
}
// sgemmSerialTransNot is the sgemmSerial kernel for the case where a is
// transposed and b is not. a is read as k rows of m entries, so entry i of
// row p of a scales row p of b into row i of c.
func sgemmSerialTransNot(m, n, k int, a []float32, lda int, b []float32, ldb int, c []float32, ldc int, alpha float32) {
	// Rows are re-sliced once and handed to the unitary kernels rather than
	// indexing with [i*stride+j]; the original authors measured this style
	// as approximately 5 times faster as of go 1.3.
	for p := 0; p < k; p++ {
		bRow := b[p*ldb : p*ldb+n]
		aRow := a[p*lda : p*lda+m]
		for row, aVal := range aRow {
			scale := alpha * aVal
			if scale == 0 {
				continue
			}
			f32.AxpyUnitary(scale, bRow, c[row*ldc:row*ldc+n])
		}
	}
}
// sgemmSerialNotTrans is the sgemmSerial kernel for the case where a is not
// transposed and b is. b is read as n rows of k entries, so c[i][j] is
// incremented by alpha times the dot product of row i of a and row j of b.
func sgemmSerialNotTrans(m, n, k int, a []float32, lda int, b []float32, ldb int, c []float32, ldc int, alpha float32) {
	// Rows are re-sliced once and handed to the unitary kernels rather than
	// indexing with [i*stride+j]; the original authors measured this style
	// as approximately 5 times faster as of go 1.3.
	for row := 0; row < m; row++ {
		aRow := a[row*lda : row*lda+k]
		cRow := c[row*ldc : row*ldc+n]
		for col := range cRow {
			bRow := b[col*ldb : col*ldb+k]
			cRow[col] += alpha * f32.DotUnitary(aRow, bRow)
		}
	}
}
// sgemmSerialTransTrans is the sgemmSerial kernel for the case where both a
// and b are transposed. a is read as k rows of m entries; for each non-zero
// scaled entry the strided kernel is applied to b starting at offset p with
// increment ldb, updating row i of c.
func sgemmSerialTransTrans(m, n, k int, a []float32, lda int, b []float32, ldb int, c []float32, ldc int, alpha float32) {
	// Rows are re-sliced once and handed to the asm kernels rather than
	// indexing with [i*stride+j]; the original authors measured this style
	// as approximately 5 times faster as of go 1.3.
	for p := 0; p < k; p++ {
		aRow := a[p*lda : p*lda+m]
		for row, aVal := range aRow {
			scale := alpha * aVal
			if scale == 0 {
				continue
			}
			cRow := c[row*ldc : row*ldc+n]
			// NOTE(review): presumably this walks column p of the stored b
			// (stride ldb) against the contiguous row of c — confirm against
			// the f32.AxpyInc signature.
			f32.AxpyInc(scale, b[p:], cRow, uintptr(n), uintptr(ldb), 1, 0, 0)
		}
	}
}
// sliceView32 returns the sub-slice of a that spans the r×c block whose top
// left element is at row i, column j, where a is laid out with row stride lda.
// The result starts at element (i, j) and ends just past element
// (i+r-1, j+c-1); it still uses stride lda.
func sliceView32(a []float32, lda, i, j, r, c int) []float32 {
	first := i*lda + j
	last := (i+r-1)*lda + j + c
	return a[first:last]
}

View File

@@ -0,0 +1,224 @@
#!/usr/bin/env bash
# Copyright ©2015 The Gonum Authors. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
#
# This script regenerates the single precision (float32 / complex64) BLAS
# sources from their double precision counterparts. Each section below
# writes a "Code generated" header into the target file and then appends the
# double precision source after rewriting it with gofmt -r and sed.

# WARNINGF32 and WARNINGC64 are spliced into sed replacement text in front of
# each renamed exported method. The trailing backslashes escape the embedded
# newlines so that sed emits them as line breaks in the replacement.
WARNINGF32='//\
// Float32 implementations are autogenerated and not directly tested.\
'
WARNINGC64='//\
// Complex64 implementations are autogenerated and not directly tested.\
'
# Level1 routines.
# level1float32.go is derived from level1float64.go:
#  - gofmt -r rewrites the interface name, the element type, the rotm
#    parameter type and the f64 asm calls to their float32 equivalents;
#  - sed renames exported D*/Id* methods (and their doc comments) to S*/Is*,
#    prepends the autogeneration warning, swaps the asm and math imports for
#    the float32 packages and replaces the safmin constant.
# NOTE(review): the closing quote inside the generated header is a
# typographic ” rather than an ASCII "; it is emitted verbatim into the
# generated file, so it is left untouched here.
echo Generating level1float32.go
echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level1float32.go
cat level1float64.go \
| gofmt -r 'blas.Float64Level1 -> blas.Float32Level1' \
\
| gofmt -r 'float64 -> float32' \
| gofmt -r 'blas.DrotmParams -> blas.SrotmParams' \
\
| gofmt -r 'f64.AxpyInc -> f32.AxpyInc' \
| gofmt -r 'f64.AxpyUnitary -> f32.AxpyUnitary' \
| gofmt -r 'f64.DotUnitary -> f32.DotUnitary' \
| gofmt -r 'f64.L2NormInc -> f32.L2NormInc' \
| gofmt -r 'f64.L2NormUnitary -> f32.L2NormUnitary' \
| gofmt -r 'f64.ScalInc -> f32.ScalInc' \
| gofmt -r 'f64.ScalUnitary -> f32.ScalUnitary' \
\
| sed -e "s_^\(func (Implementation) \)D\(.*\)\$_$WARNINGF32\1S\2_" \
-e 's_^// D_// S_' \
-e "s_^\(func (Implementation) \)Id\(.*\)\$_$WARNINGF32\1Is\2_" \
-e 's_^// Id_// Is_' \
-e 's_"gonum.org/v1/gonum/internal/asm/f64"_"gonum.org/v1/gonum/internal/asm/f32"_' \
-e 's_"math"_math "gonum.org/v1/gonum/internal/math32"_' \
-e 's_safmin = 0x1p-1022_safmin = 0x1p-126_' \
>> level1float32.go
# level1cmplx64.go is derived from level1cmplx128.go:
#  - gofmt -r narrows float64/complex128 to float32/complex64 and retargets
#    the c128 asm calls and the dcabs1 helper;
#  - sed renames the exported methods and their doc comments. The specific
#    Zdot and Zdscal rules come before the general Z rule, so those names
#    are already rewritten by the time the general rule is reached.
echo Generating level1cmplx64.go
echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level1cmplx64.go
cat level1cmplx128.go \
| gofmt -r 'blas.Complex128Level1 -> blas.Complex64Level1' \
\
| gofmt -r 'float64 -> float32' \
| gofmt -r 'complex128 -> complex64' \
\
| gofmt -r 'c128.AxpyInc -> c64.AxpyInc' \
| gofmt -r 'c128.AxpyUnitary -> c64.AxpyUnitary' \
| gofmt -r 'c128.DotcInc -> c64.DotcInc' \
| gofmt -r 'c128.DotcUnitary -> c64.DotcUnitary' \
| gofmt -r 'c128.DotuInc -> c64.DotuInc' \
| gofmt -r 'c128.DotuUnitary -> c64.DotuUnitary' \
| gofmt -r 'c128.ScalInc -> c64.ScalInc' \
| gofmt -r 'c128.ScalUnitary -> c64.ScalUnitary' \
| gofmt -r 'dcabs1 -> scabs1' \
\
| sed -e "s_^\(func (Implementation) \)Zdot\(.*\)\$_$WARNINGC64\1Cdot\2_" \
-e 's_^// Zdot_// Cdot_' \
-e "s_^\(func (Implementation) \)Zdscal\(.*\)\$_$WARNINGC64\1Csscal\2_" \
-e 's_^// Zdscal_// Csscal_' \
-e "s_^\(func (Implementation) \)Z\(.*\)\$_$WARNINGC64\1C\2_" \
-e 's_^// Z_// C_' \
-e "s_^\(func (Implementation) \)Iz\(.*\)\$_$WARNINGC64\1Ic\2_" \
-e 's_^// Iz_// Ic_' \
-e "s_^\(func (Implementation) \)Dz\(.*\)\$_$WARNINGC64\1Sc\2_" \
-e 's_^// Dz_// Sc_' \
-e 's_"gonum.org/v1/gonum/internal/asm/c128"_"gonum.org/v1/gonum/internal/asm/c64"_' \
-e 's_"math"_math "gonum.org/v1/gonum/internal/math32"_' \
>> level1cmplx64.go
# level1float32_sdot.go is derived from level1float64_ddot.go: every float64
# becomes float32, the f64 dot kernels become the f32 ones, and the exported
# D* method is renamed to S*.
echo Generating level1float32_sdot.go
echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level1float32_sdot.go
cat level1float64_ddot.go \
| gofmt -r 'float64 -> float32' \
\
| gofmt -r 'f64.DotInc -> f32.DotInc' \
| gofmt -r 'f64.DotUnitary -> f32.DotUnitary' \
\
| sed -e "s_^\(func (Implementation) \)D\(.*\)\$_$WARNINGF32\1S\2_" \
-e 's_^// D_// S_' \
-e 's_"gonum.org/v1/gonum/internal/asm/f64"_"gonum.org/v1/gonum/internal/asm/f32"_' \
>> level1float32_sdot.go
# level1float32_dsdot.go is derived from level1float64_ddot.go. Only the
# slice type []float64 is narrowed to []float32 here (the rewrite pattern does
# not match a bare float64), the kernels are switched to f32.DdotInc and
# f32.DdotUnitary, and the exported D* method is renamed to Ds*.
echo Generating level1float32_dsdot.go
echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level1float32_dsdot.go
cat level1float64_ddot.go \
| gofmt -r '[]float64 -> []float32' \
\
| gofmt -r 'f64.DotInc -> f32.DdotInc' \
| gofmt -r 'f64.DotUnitary -> f32.DdotUnitary' \
\
| sed -e "s_^\(func (Implementation) \)D\(.*\)\$_$WARNINGF32\1Ds\2_" \
-e 's_^// D_// Ds_' \
-e 's_"gonum.org/v1/gonum/internal/asm/f64"_"gonum.org/v1/gonum/internal/asm/f32"_' \
>> level1float32_dsdot.go
# level1float32_sdsdot.go is derived from level1float64_ddot.go. The gofmt
# rules replace each dot kernel call with alpha plus the float32 conversion
# of the corresponding f32.Ddot* call; sed then renames the method to Sds*,
# adjusts the doc comment and the \sum formula, and adds the alpha parameter
# by rewriting "n int" to "n int, alpha float32".
echo Generating level1float32_sdsdot.go
echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level1float32_sdsdot.go
cat level1float64_ddot.go \
| gofmt -r 'float64 -> float32' \
\
| gofmt -r 'f64.DotInc(x, y, f(n), f(incX), f(incY), f(ix), f(iy)) -> alpha + float32(f32.DdotInc(x, y, f(n), f(incX), f(incY), f(ix), f(iy)))' \
| gofmt -r 'f64.DotUnitary(a, b) -> alpha + float32(f32.DdotUnitary(a, b))' \
\
| sed -e "s_^\(func (Implementation) \)D\(.*\)\$_$WARNINGF32\1Sds\2_" \
-e 's_^// D\(.*\)$_// Sds\1 plus a constant_' \
-e 's_\\sum_alpha + \\sum_' \
-e 's/n int/n int, alpha float32/' \
-e 's_"gonum.org/v1/gonum/internal/asm/f64"_"gonum.org/v1/gonum/internal/asm/f32"_' \
>> level1float32_sdsdot.go
# Level2 routines.
# level2float32.go is derived from level2float64.go: the interface name, the
# element type, the f64 asm calls and the internal Implementation{}.Dscal
# call are retargeted to float32, then sed renames the exported D* methods
# to S* and swaps the asm import.
echo Generating level2float32.go
echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level2float32.go
cat level2float64.go \
| gofmt -r 'blas.Float64Level2 -> blas.Float32Level2' \
\
| gofmt -r 'float64 -> float32' \
\
| gofmt -r 'f64.AxpyInc -> f32.AxpyInc' \
| gofmt -r 'f64.AxpyIncTo -> f32.AxpyIncTo' \
| gofmt -r 'f64.AxpyUnitary -> f32.AxpyUnitary' \
| gofmt -r 'f64.AxpyUnitaryTo -> f32.AxpyUnitaryTo' \
| gofmt -r 'f64.DotInc -> f32.DotInc' \
| gofmt -r 'f64.DotUnitary -> f32.DotUnitary' \
| gofmt -r 'f64.ScalInc -> f32.ScalInc' \
| gofmt -r 'f64.ScalUnitary -> f32.ScalUnitary' \
| gofmt -r 'f64.Ger -> f32.Ger' \
| gofmt -r 'f64.GemvN -> f32.GemvN' \
| gofmt -r 'f64.GemvT -> f32.GemvT' \
| gofmt -r 'Implementation{}.Dscal -> Implementation{}.Sscal' \
\
| sed -e "s_^\(func (Implementation) \)D\(.*\)\$_$WARNINGF32\1S\2_" \
-e 's_^// D_// S_' \
-e 's_"gonum.org/v1/gonum/internal/asm/f64"_"gonum.org/v1/gonum/internal/asm/f32"_' \
>> level2float32.go
# level2cmplx64.go is derived from level2cmplx128.go: complex128/float64 are
# narrowed, the c128 asm calls are retargeted to c64, the exported Z* methods
# are renamed to C*, and the asm and math/cmplx imports are replaced by the
# c64 and internal cmplx64 packages.
echo Generating level2cmplx64.go
echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level2cmplx64.go
cat level2cmplx128.go \
| gofmt -r 'blas.Complex128Level2 -> blas.Complex64Level2' \
\
| gofmt -r 'complex128 -> complex64' \
| gofmt -r 'float64 -> float32' \
\
| gofmt -r 'c128.AxpyInc -> c64.AxpyInc' \
| gofmt -r 'c128.AxpyUnitary -> c64.AxpyUnitary' \
| gofmt -r 'c128.DotuInc -> c64.DotuInc' \
| gofmt -r 'c128.DotuUnitary -> c64.DotuUnitary' \
| gofmt -r 'c128.ScalInc -> c64.ScalInc' \
| gofmt -r 'c128.ScalUnitary -> c64.ScalUnitary' \
\
| sed -e "s_^\(func (Implementation) \)Z\(.*\)\$_$WARNINGC64\1C\2_" \
-e 's_^// Z_// C_' \
-e 's_"gonum.org/v1/gonum/internal/asm/c128"_"gonum.org/v1/gonum/internal/asm/c64"_' \
-e 's_"math/cmplx"_cmplx "gonum.org/v1/gonum/internal/cmplx64"_' \
>> level2cmplx64.go
# Level3 routines.
# level3float32.go is derived from level3float64.go: the interface name, the
# element type and the f64 asm calls are retargeted to float32, then sed
# renames the exported D* methods to S* and swaps the asm import.
echo Generating level3float32.go
echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level3float32.go
cat level3float64.go \
| gofmt -r 'blas.Float64Level3 -> blas.Float32Level3' \
\
| gofmt -r 'float64 -> float32' \
\
| gofmt -r 'f64.AxpyUnitaryTo -> f32.AxpyUnitaryTo' \
| gofmt -r 'f64.AxpyUnitary -> f32.AxpyUnitary' \
| gofmt -r 'f64.DotUnitary -> f32.DotUnitary' \
| gofmt -r 'f64.ScalUnitary -> f32.ScalUnitary' \
\
| sed -e "s_^\(func (Implementation) \)D\(.*\)\$_$WARNINGF32\1S\2_" \
-e 's_^// D_// S_' \
-e 's_"gonum.org/v1/gonum/internal/asm/f64"_"gonum.org/v1/gonum/internal/asm/f32"_' \
>> level3float32.go
# sgemm.go is derived from dgemm.go: the element type, the slice-view helper,
# the dgemm* helper names and the f64 asm calls are retargeted to their
# single precision names. The sed rules rename the exported D* method and
# rewrite comment lines that begin with "// D" or "// d".
# NOTE(review): the '^// d' rule is anchored at the start of the line, so it
# presumably does not reach comments that are indented inside function
# bodies in dgemm.go — confirm if stale "dgemm" names appear in sgemm.go.
echo Generating sgemm.go
echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > sgemm.go
cat dgemm.go \
| gofmt -r 'float64 -> float32' \
| gofmt -r 'sliceView64 -> sliceView32' \
\
| gofmt -r 'dgemmParallel -> sgemmParallel' \
| gofmt -r 'computeNumBlocks64 -> computeNumBlocks32' \
| gofmt -r 'dgemmSerial -> sgemmSerial' \
| gofmt -r 'dgemmSerialNotNot -> sgemmSerialNotNot' \
| gofmt -r 'dgemmSerialTransNot -> sgemmSerialTransNot' \
| gofmt -r 'dgemmSerialNotTrans -> sgemmSerialNotTrans' \
| gofmt -r 'dgemmSerialTransTrans -> sgemmSerialTransTrans' \
\
| gofmt -r 'f64.AxpyInc -> f32.AxpyInc' \
| gofmt -r 'f64.AxpyUnitary -> f32.AxpyUnitary' \
| gofmt -r 'f64.DotUnitary -> f32.DotUnitary' \
\
| sed -e "s_^\(func (Implementation) \)D\(.*\)\$_$WARNINGF32\1S\2_" \
-e 's_^// D_// S_' \
-e 's_^// d_// s_' \
-e 's_"gonum.org/v1/gonum/internal/asm/f64"_"gonum.org/v1/gonum/internal/asm/f32"_' \
>> sgemm.go
# level3cmplx64.go is derived from level3cmplx128.go: float64/complex128 are
# narrowed, the c128 asm calls are retargeted to c64 (DscalUnitary becomes
# SscalUnitary), the exported Z* methods are renamed to C*, and the asm and
# math/cmplx imports are replaced by the c64 and internal cmplx64 packages.
echo Generating level3cmplx64.go
echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level3cmplx64.go
cat level3cmplx128.go \
| gofmt -r 'blas.Complex128Level3 -> blas.Complex64Level3' \
\
| gofmt -r 'float64 -> float32' \
| gofmt -r 'complex128 -> complex64' \
\
| gofmt -r 'c128.ScalUnitary -> c64.ScalUnitary' \
| gofmt -r 'c128.DscalUnitary -> c64.SscalUnitary' \
| gofmt -r 'c128.DotcUnitary -> c64.DotcUnitary' \
| gofmt -r 'c128.AxpyUnitary -> c64.AxpyUnitary' \
| gofmt -r 'c128.DotuUnitary -> c64.DotuUnitary' \
\
| sed -e "s_^\(func (Implementation) \)Z\(.*\)\$_$WARNINGC64\1C\2_" \
-e 's_^// Z_// C_' \
-e 's_"gonum.org/v1/gonum/internal/asm/c128"_"gonum.org/v1/gonum/internal/asm/c64"_' \
-e 's_"math/cmplx"_cmplx "gonum.org/v1/gonum/internal/cmplx64"_' \
>> level3cmplx64.go

7
vendor/gonum.org/v1/gonum/floats/README.md generated vendored Normal file
View File

@@ -0,0 +1,7 @@
# Gonum floats
[![go.dev reference](https://pkg.go.dev/badge/gonum.org/v1/gonum/floats)](https://pkg.go.dev/gonum.org/v1/gonum/floats)
[![GoDoc](https://godocs.io/gonum.org/v1/gonum/floats?status.svg)](https://godocs.io/gonum.org/v1/gonum/floats)
Package floats provides a set of helper routines for dealing with slices of float64.
The functions avoid allocations to allow for use within tight loops without garbage collection overhead.

11
vendor/gonum.org/v1/gonum/floats/doc.go generated vendored Normal file
View File

@@ -0,0 +1,11 @@
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package floats provides a set of helper routines for dealing with slices
// of float64. The functions avoid allocations to allow for use within tight
// loops without garbage collection overhead.
//
// The convention used is that when a slice is being modified in place, it has
// the name dst.
package floats // import "gonum.org/v1/gonum/floats"

807
vendor/gonum.org/v1/gonum/floats/floats.go generated vendored Normal file
View File

@@ -0,0 +1,807 @@
// Copyright ©2013 The Gonum Authors. All rights reserved.
// Use of this code is governed by a BSD-style
// license that can be found in the LICENSE file.
package floats
import (
"errors"
"math"
"sort"
"gonum.org/v1/gonum/floats/scalar"
"gonum.org/v1/gonum/internal/asm/f64"
)
// Panic messages used by the argument checks in this package.
const (
// zeroLength is used when a slice argument must not be empty.
zeroLength = "floats: zero length slice"
// shortSpan is used when fewer than two elements (or points) are supplied.
shortSpan = "floats: slice length less than 2"
// badLength is used when two input slices differ in length.
badLength = "floats: slice lengths do not match"
// badDstLength is used when the destination length does not match the input.
badDstLength = "floats: destination slice length does not match input"
)
// Add adds, element-wise, the elements of s and dst, and stores the result in dst.
// It panics with badDstLength if the argument lengths do not match.
func Add(dst, s []float64) {
if len(dst) != len(s) {
panic(badDstLength)
}
// NOTE(review): presumably AxpyUnitaryTo(dst, alpha, x, y) stores
// alpha*x + y into dst, giving dst = 1*s + dst — confirm against the f64 package.
f64.AxpyUnitaryTo(dst, 1, s, dst)
}
// AddTo computes the element-wise sum of s and t, writes it into dst and
// returns dst.
// It panics if s and t differ in length, or if dst does not have the same
// length as the inputs.
func AddTo(dst, s, t []float64) []float64 {
	switch {
	case len(s) != len(t):
		panic(badLength)
	case len(dst) != len(s):
		panic(badDstLength)
	}
	f64.AxpyUnitaryTo(dst, 1, s, t)
	return dst
}
// AddConst adds the scalar c to all of the values in dst.
// The update is done in place by f64.AddConst; no length checks are needed
// because there is only one slice argument.
func AddConst(c float64, dst []float64) {
f64.AddConst(c, dst)
}
// AddScaled performs dst = dst + alpha * s.
// It panics with badLength if the slice argument lengths do not match.
func AddScaled(dst []float64, alpha float64, s []float64) {
if len(dst) != len(s) {
panic(badLength)
}
// dst is passed both as the destination and as the addend, so the
// update happens in place.
f64.AxpyUnitaryTo(dst, alpha, s, dst)
}
// AddScaledTo stores y + alpha*s in dst and returns dst, where alpha is a
// scalar and dst, y and s are slices.
// It panics if s and y differ in length, or if dst does not have the same
// length as y.
//
// On return, dst[i] = y[i] + alpha * s[i].
func AddScaledTo(dst, y []float64, alpha float64, s []float64) []float64 {
	switch {
	case len(s) != len(y):
		panic(badLength)
	case len(dst) != len(y):
		panic(badDstLength)
	}
	f64.AxpyUnitaryTo(dst, alpha, s, y)
	return dst
}
// argsort pairs a slice of values with a slice of indices so that both are
// permuted together. It implements sort.Interface and is used by Argsort
// and ArgsortStable.
type argsort struct {
	s    []float64
	inds []int
}

// Len reports the number of values being sorted.
func (as argsort) Len() int {
	return len(as.s)
}

// Less orders elements by their value in s.
func (as argsort) Less(i, j int) bool {
	return as.s[i] < as.s[j]
}

// Swap exchanges both the values and their recorded indices.
func (as argsort) Swap(i, j int) {
	vals, idx := as.s, as.inds
	vals[i], vals[j] = vals[j], vals[i]
	idx[i], idx[j] = idx[j], idx[i]
}
// Argsort sorts dst in increasing order in place and records, in inds, where
// each element came from: on return dst[i] = origDst[inds[i]].
// Any previous contents of inds are overwritten.
// It panics if dst and inds have different lengths.
func Argsort(dst []float64, inds []int) {
	if len(inds) != len(dst) {
		panic(badDstLength)
	}
	// Start from the identity permutation.
	for i := 0; i < len(inds); i++ {
		inds[i] = i
	}
	sort.Sort(argsort{s: dst, inds: inds})
}
// ArgsortStable sorts dst in increasing order in place, keeping equal
// elements in their original relative order, and records in inds where each
// element came from: on return dst[i] = origDst[inds[i]].
// Any previous contents of inds are overwritten.
// It panics if dst and inds have different lengths.
func ArgsortStable(dst []float64, inds []int) {
	if len(inds) != len(dst) {
		panic(badDstLength)
	}
	// Start from the identity permutation.
	for i := 0; i < len(inds); i++ {
		inds[i] = i
	}
	sort.Stable(argsort{s: dst, inds: inds})
}
// Count calls f on every element of s, in order, and returns how many of
// those calls returned true.
func Count(f func(float64) bool, s []float64) int {
	matches := 0
	for i := 0; i < len(s); i++ {
		if !f(s[i]) {
			continue
		}
		matches++
	}
	return matches
}
// CumProd writes the running product of s into dst and returns dst.
// It panics if dst and s have different lengths. An empty dst is returned
// unchanged.
//
// On return, dst[i] = s[i] * s[i-1] * s[i-2] * ...
func CumProd(dst, s []float64) []float64 {
	n := len(dst)
	if n != len(s) {
		panic(badDstLength)
	}
	if n == 0 {
		return dst
	}
	return f64.CumProd(dst, s)
}
// CumSum writes the running sum of s into dst and returns dst.
// It panics if dst and s have different lengths. An empty dst is returned
// unchanged.
//
// On return, dst[i] = s[i] + s[i-1] + s[i-2] + ...
func CumSum(dst, s []float64) []float64 {
	n := len(dst)
	if n != len(s) {
		panic(badDstLength)
	}
	if n == 0 {
		return dst
	}
	return f64.CumSum(dst, s)
}
// Distance returns the L-norm of the element-wise difference s - t.
// L = 2 and L = 1 are handled by dedicated paths, L = +Inf gives the largest
// absolute difference, and any other L uses the general power formula.
// Empty inputs give 0.
// It panics if s and t have different lengths.
func Distance(s, t []float64, L float64) float64 {
	if len(s) != len(t) {
		panic(badLength)
	}
	if len(s) == 0 {
		return 0
	}

	switch {
	case L == 2:
		return f64.L2DistanceUnitary(s, t)

	case L == 1:
		var total float64
		for i := range s {
			total += math.Abs(t[i] - s[i])
		}
		return total

	case math.IsInf(L, 1):
		var largest float64
		for i := range s {
			if d := math.Abs(t[i] - s[i]); d > largest {
				largest = d
			}
		}
		return largest
	}

	var acc float64
	for i := range s {
		acc += math.Pow(math.Abs(t[i]-s[i]), L)
	}
	return math.Pow(acc, 1/L)
}
// Div performs element-wise division dst / s
// and stores the value in dst.
// It panics with badLength if the argument lengths do not match.
// The division itself is delegated to f64.Div.
func Div(dst, s []float64) {
if len(dst) != len(s) {
panic(badLength)
}
f64.Div(dst, s)
}
// DivTo computes the element-wise quotient s / t, stores it in dst and
// returns the slice produced by f64.DivTo.
// It panics if s and t differ in length, or if dst does not have the same
// length as the inputs.
func DivTo(dst, s, t []float64) []float64 {
	switch {
	case len(s) != len(t):
		panic(badLength)
	case len(dst) != len(s):
		panic(badDstLength)
	}
	return f64.DivTo(dst, s, t)
}
// Dot computes the dot product of s1 and s2, i.e.
// sum_{i = 1}^N s1[i]*s2[i].
// It panics with badLength if the argument lengths do not match.
// The computation is delegated to f64.DotUnitary.
func Dot(s1, s2 []float64) float64 {
if len(s1) != len(s2) {
panic(badLength)
}
return f64.DotUnitary(s1, s2)
}
// Equal reports whether s1 and s2 have the same length and every pair of
// corresponding elements compares equal with ==. Because the comparison is
// ==, a NaN never equals anything, including another NaN.
func Equal(s1, s2 []float64) bool {
	if len(s1) != len(s2) {
		return false
	}
	for i := range s1 {
		if s1[i] != s2[i] {
			return false
		}
	}
	return true
}
// EqualApprox reports whether s1 and s2 have the same length and every pair
// of corresponding elements passes scalar.EqualWithinAbsOrRel with tol used
// as both the absolute and the relative tolerance.
func EqualApprox(s1, s2 []float64, tol float64) bool {
	if len(s1) != len(s2) {
		return false
	}
	for i := range s1 {
		if scalar.EqualWithinAbsOrRel(s1[i], s2[i], tol, tol) {
			continue
		}
		return false
	}
	return true
}
// EqualFunc reports whether s1 and s2 have the same length and
// f(s1[i], s2[i]) is true for every index i. f is called in index order and
// evaluation stops at the first pair for which it returns false.
func EqualFunc(s1, s2 []float64, f func(float64, float64) bool) bool {
	if len(s1) != len(s2) {
		return false
	}
	for i := range s1 {
		if f(s1[i], s2[i]) {
			continue
		}
		return false
	}
	return true
}
// EqualLengths reports whether every slice passed in has the same length.
// It returns true when called with no slices at all.
func EqualLengths(slices ...[]float64) bool {
	// Guard the slices[0] access below; see http://play.golang.org/p/sdty6YiLhM
	if len(slices) == 0 {
		return true
	}
	want := len(slices[0])
	for _, sl := range slices[1:] {
		if len(sl) != want {
			return false
		}
	}
	return true
}
// Find scans s in order and collects the indices of the elements for which
// f returns true. With k > 0 it stops after the first k matches; with k < 0
// it collects every match; with k == 0 it returns immediately.
//
// inds is resliced to zero length and reused as the backing storage for the
// result, so the caller's inds is in an undetermined state afterwards and
// the returned slice must be used instead. Passing nil for inds is allowed.
//
// If k > 0 and fewer than k elements satisfy f, the matches that were found
// are returned together with a non-nil error.
func Find(inds []int, f func(float64) bool, s []float64, k int) ([]int, error) {
	inds = inds[:0]
	if k == 0 {
		// Nothing was asked for.
		return inds, nil
	}

	for i, v := range s {
		if !f(v) {
			continue
		}
		inds = append(inds, i)
		// A negative k never triggers the early exit.
		if k > 0 && len(inds) == k {
			return inds, nil
		}
	}

	if k < 0 {
		// Every match was requested, so running off the end is success.
		return inds, nil
	}
	// The scan finished without reaching k matches.
	return inds, errors.New("floats: insufficient elements found")
}
// HasNaN reports whether at least one element of s is NaN.
// It returns false for an empty slice.
func HasNaN(s []float64) bool {
	for i := 0; i < len(s); i++ {
		if math.IsNaN(s[i]) {
			return true
		}
	}
	return false
}
// LogSpan fills dst with N points that are equally spaced in log space
// between l and u, where N is len(dst). The endpoints are computed as
// exp(log(l)) and exp(log(u)).
// It panics if the length of dst is less than 2.
// Note that this call will return NaNs if either l or u are negative, and
// will return all zeros if l or u is zero.
// The mutated slice dst is also returned so that the call can be used in a
// range clause:
//
//	for i, x := range LogSpan(dst, l, u) { ... }
func LogSpan(dst []float64, l, u float64) []float64 {
	logLo, logHi := math.Log(l), math.Log(u)
	Span(dst, logLo, logHi)
	for i, v := range dst {
		dst[i] = math.Exp(v)
	}
	return dst
}
// LogSumExp returns log(sum_i exp(s[i])).
// It panics if s is an empty slice.
func LogSumExp(s []float64) float64 {
	// Shift by the largest value before exponentiating so that the
	// intermediate terms neither overflow nor all underflow.
	shift := Max(s)
	if math.IsInf(shift, 0) {
		// The answer is that same infinity; returning here avoids the
		// Inf - Inf = NaN that the shifted sum would produce.
		return shift
	}

	sumExp := 0.0
	for i := range s {
		sumExp += math.Exp(s[i] - shift)
	}
	// Undo the shift after taking the log.
	return math.Log(sumExp) + shift
}
// Max returns the maximum value in the input slice. If the slice is empty, Max will panic.
// The element is located with MaxIdx.
func Max(s []float64) float64 {
return s[MaxIdx(s)]
}
// MaxIdx returns the index of the largest value in s. When several entries
// share the maximum, the first of them is reported. NaN entries are skipped;
// if every entry is NaN the result is 0.
// It panics if s is zero length.
func MaxIdx(s []float64) int {
	if len(s) == 0 {
		panic(zeroLength)
	}
	// best stays NaN until the first non-NaN element has been seen.
	best := math.NaN()
	idx := 0
	for i, v := range s {
		if math.IsNaN(v) {
			continue
		}
		if math.IsNaN(best) || v > best {
			best, idx = v, i
		}
	}
	return idx
}
// Min returns the minimum value in the input slice.
// It panics if s is zero length.
// The element is located with MinIdx.
func Min(s []float64) float64 {
return s[MinIdx(s)]
}
// MinIdx returns the index of the smallest value in s. When several entries
// share the minimum, the first of them is reported. NaN entries are skipped;
// if every entry is NaN the result is 0.
// It panics if s is zero length.
func MinIdx(s []float64) int {
	if len(s) == 0 {
		panic(zeroLength)
	}
	// best stays NaN until the first non-NaN element has been seen.
	best := math.NaN()
	idx := 0
	for i, v := range s {
		if math.IsNaN(v) {
			continue
		}
		if math.IsNaN(best) || v < best {
			best, idx = v, i
		}
	}
	return idx
}
// Mul multiplies dst element-wise by s in place: dst[i] *= s[i].
// It panics if dst and s have different lengths.
func Mul(dst, s []float64) {
	if len(s) != len(dst) {
		panic(badLength)
	}
	for i := range dst {
		dst[i] *= s[i]
	}
}
// MulTo stores the element-wise product of s and t in dst and returns dst.
// It panics if s and t differ in length, or if dst does not have the same
// length as the inputs.
func MulTo(dst, s, t []float64) []float64 {
	switch {
	case len(s) != len(t):
		panic(badLength)
	case len(dst) != len(s):
		panic(badDstLength)
	}
	for i := range dst {
		dst[i] = t[i] * s[i]
	}
	return dst
}
// NearestIdx returns the index of the element of s that is closest to v,
// preferring the lowest index when several elements are equally close.
// A NaN v yields 0, +Inf yields MaxIdx(s) and -Inf yields MinIdx(s).
// Elements whose distance to v is NaN are never selected.
// It panics if s is zero length.
func NearestIdx(s []float64, v float64) int {
	if len(s) == 0 {
		panic(zeroLength)
	}
	if math.IsNaN(v) {
		return 0
	}
	if math.IsInf(v, 1) {
		return MaxIdx(s)
	}
	if math.IsInf(v, -1) {
		return MinIdx(s)
	}

	nearest := 0
	// best stays NaN until the first usable distance has been seen.
	best := math.NaN()
	for i := range s {
		d := math.Abs(v - s[i])
		if math.IsNaN(d) {
			// A NaN distance will not be closer.
			continue
		}
		if d < best || math.IsNaN(best) {
			best, nearest = d, i
		}
	}
	return nearest
}
// NearestIdxForSpan returns the index of a hypothetical vector created
// by Span with length n and bounds l and u whose value is closest
// to v. That is, NearestIdxForSpan(n, l, u, v) is equivalent to
// NearestIdx(Span(make([]float64, n),l,u),v) without an allocation.
// A NaN v yields 0.
// It panics if n is less than two.
func NearestIdxForSpan(n int, l, u float64, v float64) int {
if n < 2 {
panic(shortSpan)
}
if math.IsNaN(v) {
return 0
}
// Special cases for Inf and NaN.
// The order of the cases matters: NaN bounds are handled first, then two
// infinite bounds, then a single infinite bound, and only then an
// infinite v with finite bounds.
switch {
case math.IsNaN(l) && !math.IsNaN(u):
// Only the last point (u) is not NaN.
return n - 1
case math.IsNaN(u):
return 0
case math.IsInf(l, 0) && math.IsInf(u, 0):
if l == u {
return 0
}
// l and u are infinities of opposite sign.
if n%2 == 1 {
// Odd n: a finite v maps to the middle index.
if !math.IsInf(v, 0) {
return n / 2
}
if math.Copysign(1, v) == math.Copysign(1, l) {
return 0
}
return n/2 + 1
}
// Even n: pick the half whose sign matches v.
if math.Copysign(1, v) == math.Copysign(1, l) {
return 0
}
return n / 2
case math.IsInf(l, 0):
if v == l {
return 0
}
return n - 1
case math.IsInf(u, 0):
if v == u {
return n - 1
}
return 0
case math.IsInf(v, -1):
// Finite bounds, v = -Inf: the smaller bound is nearest.
if l <= u {
return 0
}
return n - 1
case math.IsInf(v, 1):
// Finite bounds, v = +Inf: the larger bound is nearest.
if u <= l {
return 0
}
return n - 1
}
// Special cases for v outside (l, u) and (u, l).
switch {
case l < u:
if v <= l {
return 0
}
if v >= u {
return n - 1
}
case l > u:
if v >= l {
return 0
}
if v <= u {
return n - 1
}
default:
// l == u: every point has the same value, so report the first.
return 0
}
// Can't guarantee anything about exactly halfway between
// because of floating point weirdness.
// Adding 0.5 before the int conversion rounds the fractional position
// to the nearest index.
return int((float64(n)-1)/(u-l)*(v-l) + 0.5)
}
// Norm returns the L norm of the slice s, defined as
//
//	(sum_{i=1}^N |s[i]|^L)^{1/L}
//
// Special cases:
//
//	L = math.Inf(1) gives the maximum absolute value.
//
// An empty slice gives 0. L = 2 and L = 1 use dedicated paths.
// Norm does not correctly compute the zero norm (use Count).
func Norm(s []float64, L float64) float64 {
	// Open questions carried over from the original implementation:
	// should a non-positive L be rejected, and should large L be
	// evaluated in log space for better numerical stability at extra cost?
	if len(s) == 0 {
		return 0
	}

	switch {
	case L == 2:
		return f64.L2NormUnitary(s)

	case L == 1:
		var total float64
		for i := range s {
			total += math.Abs(s[i])
		}
		return total

	case math.IsInf(L, 1):
		var largest float64
		for i := range s {
			largest = math.Max(largest, math.Abs(s[i]))
		}
		return largest
	}

	var acc float64
	for i := range s {
		acc += math.Pow(math.Abs(s[i]), L)
	}
	return math.Pow(acc, 1/L)
}
// Prod multiplies together all elements of s, in order, and returns the
// result. The product of an empty slice is 1.
func Prod(s []float64) float64 {
	result := 1.0
	for i := 0; i < len(s); i++ {
		result *= s[i]
	}
	return result
}
// Reverse reverses the elements of s in place by swapping each element in
// the first half with its mirror in the second half.
func Reverse(s []float64) {
	n := len(s)
	for i := 0; i < n/2; i++ {
		j := n - 1 - i
		s[i], s[j] = s[j], s[i]
	}
}
// Same reports whether s and t have the same length and hold the same
// values element by element, where two NaNs count as the same value.
func Same(s, t []float64) bool {
	if len(s) != len(t) {
		return false
	}
	for i := range s {
		a, b := s[i], t[i]
		if a == b {
			continue
		}
		if math.IsNaN(a) && math.IsNaN(b) {
			continue
		}
		return false
	}
	return true
}
// Scale multiplies every element of dst by the scalar c, in place.
// An empty dst is left untouched and f64.ScalUnitary is not called.
func Scale(c float64, dst []float64) {
	if len(dst) == 0 {
		return
	}
	f64.ScalUnitary(c, dst)
}
// ScaleTo stores c times each element of s in dst and returns dst.
// An empty dst is returned without calling f64.ScalUnitaryTo.
// It panics if dst and s have different lengths.
func ScaleTo(dst []float64, c float64, s []float64) []float64 {
	if len(s) != len(dst) {
		panic(badDstLength)
	}
	if len(dst) == 0 {
		return dst
	}
	f64.ScalUnitaryTo(dst, c, s)
	return dst
}
// Span fills dst with N equally spaced points between l and u, where N is
// len(dst). The first element is l and the last element is u.
// It panics if the length of dst is less than 2.
//
// NaN and infinite bounds are handled specially:
//   - if l is NaN, every element but the last is NaN and the last is u;
//   - otherwise, if u is NaN, every element but the first is NaN;
//   - if both bounds are infinite, the first half is l, the second half is u,
//     and for odd N the middle element is 0 when l != u and l otherwise;
//   - if only l is infinite, every element but the last is l;
//   - if only u is infinite, every element but the first is u.
//
// Span also returns the mutated slice dst, so that it can be used in range
// expressions, like:
//
//	for i, x := range Span(dst, l, u) { ... }
func Span(dst []float64, l, u float64) []float64 {
	n := len(dst)
	if n < 2 {
		panic(shortSpan)
	}
	last := n - 1

	// Special cases for Inf and NaN.
	switch {
	case math.IsNaN(l):
		for i := 0; i < last; i++ {
			dst[i] = math.NaN()
		}
		dst[last] = u
		return dst

	case math.IsNaN(u):
		for i := 1; i < n; i++ {
			dst[i] = math.NaN()
		}
		dst[0] = l
		return dst

	case math.IsInf(l, 0) && math.IsInf(u, 0):
		half := n / 2
		for i := 0; i < half; i++ {
			dst[i] = l
			dst[last-i] = u
		}
		if n%2 == 1 {
			mid := l
			if l != u {
				mid = 0
			}
			dst[half] = mid
		}
		return dst

	case math.IsInf(l, 0):
		for i := 0; i < last; i++ {
			dst[i] = l
		}
		dst[last] = u
		return dst

	case math.IsInf(u, 0):
		for i := 1; i < n; i++ {
			dst[i] = u
		}
		dst[0] = l
		return dst
	}

	// Regular case: constant step from l.
	step := (u - l) / float64(n-1)
	for i := range dst {
		dst[i] = l + step*float64(i)
	}
	return dst
}
// Sub subtracts, element-wise, the elements of s from dst.
// It panics with badLength if the argument lengths do not match.
func Sub(dst, s []float64) {
if len(dst) != len(s) {
panic(badLength)
}
// Subtraction is expressed as an axpy with a coefficient of -1; dst is
// both the destination and the addend, so the update is in place.
f64.AxpyUnitaryTo(dst, -1, s, dst)
}
// SubTo stores the element-wise difference s - t in dst and returns dst.
// It panics if s and t differ in length, or if dst does not have the same
// length as the inputs.
func SubTo(dst, s, t []float64) []float64 {
	switch {
	case len(s) != len(t):
		panic(badLength)
	case len(dst) != len(s):
		panic(badDstLength)
	}
	// s - t is expressed as an axpy with a coefficient of -1 on t.
	f64.AxpyUnitaryTo(dst, -1, t, s)
	return dst
}
// Sum returns the sum of the elements of the slice.
// The summation is delegated to f64.Sum; see SumCompensated for a more
// accurate but more expensive alternative.
func Sum(s []float64) float64 {
return f64.Sum(s)
}
// Within returns the first index i where s[i] <= v < s[i+1].
// It returns -1 when there is no such index: when v is NaN, when v is below
// s[0], or when v is not below the last element of s.
// Within panics if:
//   - len(s) < 2
//   - s is not sorted
func Within(s []float64, v float64) int {
	if len(s) < 2 {
		panic(shortSpan)
	}
	if !sort.Float64sAreSorted(s) {
		panic("floats: input slice not sorted")
	}

	last := len(s) - 1
	if v < s[0] || v >= s[last] || math.IsNaN(v) {
		return -1
	}
	// v is inside [s[0], s[last]); the first element above v closes the
	// interval that contains it.
	for i := 1; i <= last; i++ {
		if v < s[i] {
			return i - 1
		}
	}
	return -1
}
// SumCompensated returns the sum of the elements of s, computed with a
// running error correction so that it is more accurate than Sum at the cost
// of extra arithmetic per element.
func SumCompensated(s []float64) float64 {
	// This is Neumaier's improvement of Kahan's compensated summation.
	// See https://en.wikipedia.org/wiki/Kahan_summation_algorithm for details.
	var total, comp float64
	for i := range s {
		x := s[i]
		// The explicit conversion forces rounding to float64 so that a
		// sufficiently smart compiler cannot optimise the correction away.
		next := float64(total + x)
		if math.Abs(total) >= math.Abs(x) {
			// The low-order bits of x were lost in the addition.
			comp += (total - next) + x
		} else {
			// The low-order bits of total were lost in the addition.
			comp += (x - next) + total
		}
		total = next
	}
	return total + comp
}

6
vendor/gonum.org/v1/gonum/floats/scalar/doc.go generated vendored Normal file
View File

@@ -0,0 +1,6 @@
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package scalar provides a set of helper routines for dealing with float64 values.
package scalar // import "gonum.org/v1/gonum/floats/scalar"

171
vendor/gonum.org/v1/gonum/floats/scalar/scalar.go generated vendored Normal file
View File

@@ -0,0 +1,171 @@
// Copyright ©2013 The Gonum Authors. All rights reserved.
// Use of this code is governed by a BSD-style
// license that can be found in the LICENSE file.
package scalar
import (
"math"
"strconv"
)
// EqualWithinAbs returns true when a and b have an absolute difference
// not greater than tol.
func EqualWithinAbs(a, b, tol float64) bool {
return a == b || math.Abs(a-b) <= tol
}
// minNormalFloat64 is the smallest normal number. For 64 bit IEEE-754
// floats this is 2^{-1022}.
const minNormalFloat64 = 0x1p-1022
// EqualWithinRel returns true when the difference between a and b
// is not greater than tol times the greater absolute value of a and b,
//
// abs(a-b) <= tol * max(abs(a), abs(b)).
func EqualWithinRel(a, b, tol float64) bool {
if a == b {
return true
}
delta := math.Abs(a - b)
if delta <= minNormalFloat64 {
return delta <= tol*minNormalFloat64
}
// We depend on the division in this relationship to identify
// infinities (we rely on the NaN to fail the test) otherwise
// we compare Infs of the same sign and evaluate Infs as equal
// independent of sign.
return delta/math.Max(math.Abs(a), math.Abs(b)) <= tol
}
// EqualWithinAbsOrRel reports whether a and b are equal to within either
// the absolute tolerance absTol or the relative tolerance relTol.
// See EqualWithinAbs and EqualWithinRel for the individual criteria.
func EqualWithinAbsOrRel(a, b, absTol, relTol float64) bool {
	if EqualWithinAbs(a, b, absTol) {
		return true
	}
	return EqualWithinRel(a, b, relTol)
}
// EqualWithinULP returns true when a and b are equal to within
// the specified number of floating point units in the last place.
func EqualWithinULP(a, b float64, ulp uint) bool {
if a == b {
return true
}
if math.IsNaN(a) || math.IsNaN(b) {
return false
}
if math.Signbit(a) != math.Signbit(b) {
return math.Float64bits(math.Abs(a))+math.Float64bits(math.Abs(b)) <= uint64(ulp)
}
return ulpDiff(math.Float64bits(a), math.Float64bits(b)) <= uint64(ulp)
}
func ulpDiff(a, b uint64) uint64 {
if a > b {
return a - b
}
return b - a
}
const (
nanBits = 0x7ff8000000000000
nanMask = 0xfff8000000000000
)
// NaNWith returns an IEEE 754 "quiet not-a-number" value with the
// payload specified in the low 51 bits of payload.
// The NaN returned by math.NaN has a bit pattern equal to NaNWith(1).
func NaNWith(payload uint64) float64 {
return math.Float64frombits(nanBits | (payload &^ nanMask))
}
// NaNPayload returns the lowest 51 bits payload of an IEEE 754 "quiet
// not-a-number". For values of f other than quiet-NaN, NaNPayload
// returns zero and false.
func NaNPayload(f float64) (payload uint64, ok bool) {
b := math.Float64bits(f)
if b&nanBits != nanBits {
return 0, false
}
return b &^ nanMask, true
}
// ParseWithNA parses s as a float64 and returns it in value.
// When s equals missing, value and weight are both 0 and err is nil.
// Otherwise weight is 1 on a successful parse; on failure weight is 0
// and value and err are those returned by strconv.ParseFloat.
func ParseWithNA(s, missing string) (value, weight float64, err error) {
	if s == missing {
		return 0, 0, nil
	}
	if value, err = strconv.ParseFloat(s, 64); err != nil {
		return value, 0, err
	}
	return value, 1, nil
}
// Round returns the half away from zero rounded value of x with prec precision.
//
// Special cases are:
//
// Round(±0) = +0
// Round(±Inf) = ±Inf
// Round(NaN) = NaN
func Round(x float64, prec int) float64 {
if x == 0 {
// Make sure zero is returned
// without the negative bit set.
return 0
}
// Fast path for positive precision on integers.
if prec >= 0 && x == math.Trunc(x) {
return x
}
pow := math.Pow10(prec)
intermed := x * pow
if math.IsInf(intermed, 0) {
return x
}
x = math.Round(intermed)
if x == 0 {
return 0
}
return x / pow
}
// RoundEven returns the half even rounded value of x with prec precision.
//
// Special cases are:
//
// RoundEven(±0) = +0
// RoundEven(±Inf) = ±Inf
// RoundEven(NaN) = NaN
func RoundEven(x float64, prec int) float64 {
if x == 0 {
// Make sure zero is returned
// without the negative bit set.
return 0
}
// Fast path for positive precision on integers.
if prec >= 0 && x == math.Trunc(x) {
return x
}
pow := math.Pow10(prec)
intermed := x * pow
if math.IsInf(intermed, 0) {
return x
}
x = math.RoundToEven(intermed)
if x == 0 {
return 0
}
return x / pow
}
// Same returns true when the inputs have the same value, allowing NaN equality.
func Same(a, b float64) bool {
return a == b || (math.IsNaN(a) && math.IsNaN(b))
}

View File

@@ -0,0 +1,134 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
// MOVDDUP X2, X3
#define MOVDDUP_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xDA
// MOVDDUP X4, X5
#define MOVDDUP_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xEC
// MOVDDUP X6, X7
#define MOVDDUP_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xFE
// MOVDDUP X8, X9
#define MOVDDUP_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC8
// ADDSUBPD X2, X3
#define ADDSUBPD_X2_X3 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA
// ADDSUBPD X4, X5
#define ADDSUBPD_X4_X5 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC
// ADDSUBPD X6, X7
#define ADDSUBPD_X6_X7 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE
// ADDSUBPD X8, X9
#define ADDSUBPD_X8_X9 BYTE $0x66; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8
// func AxpyInc(alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr)
//
// AxpyInc computes y[iy + i*incY] += alpha * x[ix + i*incX] for i in [0, n).
// The main loop processes four elements per iteration; the tail loop
// processes the remaining n % 4 elements one at a time.
// Register contents in the comments are written { high 64 bits, low 64 bits }.
TEXT ·AxpyInc(SB), NOSPLIT, $0
MOVQ x_base+16(FP), SI // SI = &x
MOVQ y_base+40(FP), DI // DI = &y
MOVQ n+64(FP), CX // CX = n
CMPQ CX, $0 // if n==0 { return }
JE axpyi_end
MOVQ ix+88(FP), R8 // R8 = ix // Load the first index
SHLQ $4, R8 // R8 *= sizeof(complex128)
MOVQ iy+96(FP), R9 // R9 = iy
SHLQ $4, R9 // R9 *= sizeof(complex128)
LEAQ (SI)(R8*1), SI // SI = &(x[ix])
LEAQ (DI)(R9*1), DI // DI = &(y[iy])
MOVQ DI, DX // DX = DI // Separate Read/Write pointers
MOVQ incX+72(FP), R8 // R8 = incX
SHLQ $4, R8 // R8 *= sizeof(complex128)
MOVQ incY+80(FP), R9 // R9 = incY
SHLQ $4, R9 // R9 *= sizeof(complex128)
MOVUPS alpha+0(FP), X0 // X0 = { imag(a), real(a) }
MOVAPS X0, X1
SHUFPD $0x1, X1, X1 // X1 = { real(a), imag(a) }
MOVAPS X0, X10 // Copy X0 and X1 for pipelining
MOVAPS X1, X11
MOVQ CX, BX
ANDQ $3, CX // CX = n % 4
SHRQ $2, BX // BX = floor( n / 4 )
JZ axpyi_tail // if BX == 0 { goto axpyi_tail }
axpyi_loop: // do {
// Load four consecutive strided elements of x; SI is advanced by
// 2*incX half way through so that only the R8*1 scale is needed.
MOVUPS (SI), X2 // X_i = { imag(x[i]), real(x[i]) }
MOVUPS (SI)(R8*1), X4
LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2])
MOVUPS (SI), X6
MOVUPS (SI)(R8*1), X8
// X_(i+1) = { real(x[i]), real(x[i]) }
MOVDDUP_X2_X3
MOVDDUP_X4_X5
MOVDDUP_X6_X7
MOVDDUP_X8_X9
// X_i = { imag(x[i]), imag(x[i]) }
SHUFPD $0x3, X2, X2
SHUFPD $0x3, X4, X4
SHUFPD $0x3, X6, X6
SHUFPD $0x3, X8, X8
// X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
// X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
MULPD X1, X2
MULPD X0, X3
MULPD X11, X4
MULPD X10, X5
MULPD X1, X6
MULPD X0, X7
MULPD X11, X8
MULPD X10, X9
// X_(i+1) = {
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i])
// }
ADDSUBPD_X2_X3
ADDSUBPD_X4_X5
ADDSUBPD_X6_X7
ADDSUBPD_X8_X9
// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
// y is read through DX and written through DI; both walk the same elements.
ADDPD (DX), X3
ADDPD (DX)(R9*1), X5
LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2])
ADDPD (DX), X7
ADDPD (DX)(R9*1), X9
MOVUPS X3, (DI) // y[i] = X_(i+1)
MOVUPS X5, (DI)(R9*1)
LEAQ (DI)(R9*2), DI // DI = &(DI[incY*2])
MOVUPS X7, (DI)
MOVUPS X9, (DI)(R9*1)
LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2])
LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2])
LEAQ (DI)(R9*2), DI // DI = &(DI[incY*2])
DECQ BX
JNZ axpyi_loop // } while --BX > 0
CMPQ CX, $0 // if CX == 0 { return }
JE axpyi_end
axpyi_tail: // do {
MOVUPS (SI), X2 // X_i = { imag(x[i]), real(x[i]) }
MOVDDUP_X2_X3 // X_(i+1) = { real(x[i]), real(x[i]) }
SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) }
MULPD X1, X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
MULPD X0, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
// X_(i+1) = {
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i])
// }
ADDSUBPD_X2_X3
// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
ADDPD (DI), X3
MOVUPS X3, (DI) // y[i] = X_(i+1)
ADDQ R8, SI // SI = &(SI[incX])
ADDQ R9, DI // DI = &(DI[incY])
LOOP axpyi_tail // } while --CX > 0
axpyi_end:
RET

View File

@@ -0,0 +1,141 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
// MOVDDUP X2, X3
#define MOVDDUP_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xDA
// MOVDDUP X4, X5
#define MOVDDUP_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xEC
// MOVDDUP X6, X7
#define MOVDDUP_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xFE
// MOVDDUP X8, X9
#define MOVDDUP_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC8
// ADDSUBPD X2, X3
#define ADDSUBPD_X2_X3 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA
// ADDSUBPD X4, X5
#define ADDSUBPD_X4_X5 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC
// ADDSUBPD X6, X7
#define ADDSUBPD_X6_X7 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE
// ADDSUBPD X8, X9
#define ADDSUBPD_X8_X9 BYTE $0x66; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8
// func AxpyIncTo(dst []complex128, incDst, idst uintptr, alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr)
//
// AxpyIncTo computes
// dst[idst + i*incDst] = alpha * x[ix + i*incX] + y[iy + i*incY]
// for i in [0, n). The main loop processes four elements per iteration;
// the tail loop processes the remaining n % 4 elements one at a time.
// Register contents in the comments are written { high 64 bits, low 64 bits }.
TEXT ·AxpyIncTo(SB), NOSPLIT, $0
MOVQ dst_base+0(FP), DI // DI = &dst
MOVQ x_base+56(FP), SI // SI = &x
MOVQ y_base+80(FP), DX // DX = &y
MOVQ n+104(FP), CX // CX = n
CMPQ CX, $0 // if n==0 { return }
JE axpyi_end
MOVQ ix+128(FP), R8 // R8 = ix // Load the first index
SHLQ $4, R8 // R8 *= sizeof(complex128)
MOVQ iy+136(FP), R9 // R9 = iy
SHLQ $4, R9 // R9 *= sizeof(complex128)
MOVQ idst+32(FP), R10 // R10 = idst
SHLQ $4, R10 // R10 *= sizeof(complex128)
LEAQ (SI)(R8*1), SI // SI = &(x[ix])
LEAQ (DX)(R9*1), DX // DX = &(y[iy])
LEAQ (DI)(R10*1), DI // DI = &(dst[idst])
MOVQ incX+112(FP), R8 // R8 = incX
SHLQ $4, R8 // R8 *= sizeof(complex128)
MOVQ incY+120(FP), R9 // R9 = incY
SHLQ $4, R9 // R9 *= sizeof(complex128)
MOVQ incDst+24(FP), R10 // R10 = incDst
SHLQ $4, R10 // R10 *= sizeof(complex128)
MOVUPS alpha+40(FP), X0 // X0 = { imag(a), real(a) }
MOVAPS X0, X1
SHUFPD $0x1, X1, X1 // X1 = { real(a), imag(a) }
MOVAPS X0, X10 // Copy X0 and X1 for pipelining
MOVAPS X1, X11
MOVQ CX, BX
ANDQ $3, CX // CX = n % 4
SHRQ $2, BX // BX = floor( n / 4 )
JZ axpyi_tail // if BX == 0 { goto axpyi_tail }
axpyi_loop: // do {
// Load four consecutive strided elements of x; SI is advanced by
// 2*incX half way through so that only the R8*1 scale is needed.
MOVUPS (SI), X2 // X_i = { imag(x[i]), real(x[i]) }
MOVUPS (SI)(R8*1), X4
LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2])
MOVUPS (SI), X6
MOVUPS (SI)(R8*1), X8
// X_(i+1) = { real(x[i]), real(x[i]) }
MOVDDUP_X2_X3
MOVDDUP_X4_X5
MOVDDUP_X6_X7
MOVDDUP_X8_X9
// X_i = { imag(x[i]), imag(x[i]) }
SHUFPD $0x3, X2, X2
SHUFPD $0x3, X4, X4
SHUFPD $0x3, X6, X6
SHUFPD $0x3, X8, X8
// X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
// X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
MULPD X1, X2
MULPD X0, X3
MULPD X11, X4
MULPD X10, X5
MULPD X1, X6
MULPD X0, X7
MULPD X11, X8
MULPD X10, X9
// X_(i+1) = {
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i])
// }
ADDSUBPD_X2_X3
ADDSUBPD_X4_X5
ADDSUBPD_X6_X7
ADDSUBPD_X8_X9
// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
ADDPD (DX), X3
ADDPD (DX)(R9*1), X5
LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2])
ADDPD (DX), X7
ADDPD (DX)(R9*1), X9
MOVUPS X3, (DI) // dst[i] = X_(i+1)
MOVUPS X5, (DI)(R10*1)
LEAQ (DI)(R10*2), DI // DI = &(DI[incDst*2])
MOVUPS X7, (DI)
MOVUPS X9, (DI)(R10*1)
LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2])
LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2])
LEAQ (DI)(R10*2), DI // DI = &(DI[incDst*2])
DECQ BX
JNZ axpyi_loop // } while --BX > 0
CMPQ CX, $0 // if CX == 0 { return }
JE axpyi_end
axpyi_tail: // do {
MOVUPS (SI), X2 // X_i = { imag(x[i]), real(x[i]) }
MOVDDUP_X2_X3 // X_(i+1) = { real(x[i]), real(x[i]) }
SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) }
MULPD X1, X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
MULPD X0, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
// X_(i+1) = {
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i])
// }
ADDSUBPD_X2_X3
// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
ADDPD (DX), X3
MOVUPS X3, (DI) // dst[i] = X_(i+1)
ADDQ R8, SI // SI += incX
ADDQ R9, DX // DX += incY
ADDQ R10, DI // DI += incDst
LOOP axpyi_tail // } while --CX > 0
axpyi_end:
RET

View File

@@ -0,0 +1,122 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
// MOVDDUP X2, X3
#define MOVDDUP_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xDA
// MOVDDUP X4, X5
#define MOVDDUP_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xEC
// MOVDDUP X6, X7
#define MOVDDUP_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xFE
// MOVDDUP X8, X9
#define MOVDDUP_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC8
// ADDSUBPD X2, X3
#define ADDSUBPD_X2_X3 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA
// ADDSUBPD X4, X5
#define ADDSUBPD_X4_X5 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC
// ADDSUBPD X6, X7
#define ADDSUBPD_X6_X7 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE
// ADDSUBPD X8, X9
#define ADDSUBPD_X8_X9 BYTE $0x66; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8
// func AxpyUnitary(alpha complex128, x, y []complex128)
//
// AxpyUnitary computes y[i] += alpha * x[i] for i in [0, min(len(x), len(y))).
// The main loop processes four elements per iteration; the tail loop
// processes the remaining n % 4 elements one at a time.
// AX indexes the slices in float64 (8 byte) units, so one complex128
// element corresponds to an index step of 2.
// Register contents in the comments are written { high 64 bits, low 64 bits }.
TEXT ·AxpyUnitary(SB), NOSPLIT, $0
MOVQ x_base+16(FP), SI // SI = &x
MOVQ y_base+40(FP), DI // DI = &y
MOVQ x_len+24(FP), CX // CX = min( len(x), len(y) )
CMPQ y_len+48(FP), CX
CMOVQLE y_len+48(FP), CX
CMPQ CX, $0 // if CX == 0 { return }
JE caxy_end
PXOR X0, X0 // Clear work registers and cache-align loop
PXOR X1, X1
MOVUPS alpha+0(FP), X0 // X0 = { imag(a), real(a) }
MOVAPS X0, X1
SHUFPD $0x1, X1, X1 // X1 = { real(a), imag(a) }
XORQ AX, AX // i = 0
MOVAPS X0, X10 // Copy X0 and X1 for pipelining
MOVAPS X1, X11
MOVQ CX, BX
ANDQ $3, CX // CX = n % 4
SHRQ $2, BX // BX = floor( n / 4 )
JZ caxy_tail // if BX == 0 { goto caxy_tail }
caxy_loop: // do {
MOVUPS (SI)(AX*8), X2 // X_i = { imag(x[i]), real(x[i]) }
MOVUPS 16(SI)(AX*8), X4
MOVUPS 32(SI)(AX*8), X6
MOVUPS 48(SI)(AX*8), X8
// X_(i+1) = { real(x[i]), real(x[i]) }
MOVDDUP_X2_X3
MOVDDUP_X4_X5
MOVDDUP_X6_X7
MOVDDUP_X8_X9
// X_i = { imag(x[i]), imag(x[i]) }
SHUFPD $0x3, X2, X2
SHUFPD $0x3, X4, X4
SHUFPD $0x3, X6, X6
SHUFPD $0x3, X8, X8
// X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
// X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
MULPD X1, X2
MULPD X0, X3
MULPD X11, X4
MULPD X10, X5
MULPD X1, X6
MULPD X0, X7
MULPD X11, X8
MULPD X10, X9
// X_(i+1) = {
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i])
// }
ADDSUBPD_X2_X3
ADDSUBPD_X4_X5
ADDSUBPD_X6_X7
ADDSUBPD_X8_X9
// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
ADDPD (DI)(AX*8), X3
ADDPD 16(DI)(AX*8), X5
ADDPD 32(DI)(AX*8), X7
ADDPD 48(DI)(AX*8), X9
MOVUPS X3, (DI)(AX*8) // y[i] = X_(i+1)
MOVUPS X5, 16(DI)(AX*8)
MOVUPS X7, 32(DI)(AX*8)
MOVUPS X9, 48(DI)(AX*8)
ADDQ $8, AX // i += 8 (8 float64 = 4 complex128)
DECQ BX
JNZ caxy_loop // } while --BX > 0
CMPQ CX, $0 // if CX == 0 { return }
JE caxy_end
caxy_tail: // do {
MOVUPS (SI)(AX*8), X2 // X_i = { imag(x[i]), real(x[i]) }
MOVDDUP_X2_X3 // X_(i+1) = { real(x[i]), real(x[i]) }
SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) }
MULPD X1, X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
MULPD X0, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
// X_(i+1) = {
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i])
// }
ADDSUBPD_X2_X3
// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
ADDPD (DI)(AX*8), X3
MOVUPS X3, (DI)(AX*8) // y[i] = X_(i+1)
ADDQ $2, AX // i += 2 (2 float64 = 1 complex128)
LOOP caxy_tail // } while --CX > 0
caxy_end:
RET

View File

@@ -0,0 +1,123 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
// MOVDDUP X2, X3
#define MOVDDUP_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xDA
// MOVDDUP X4, X5
#define MOVDDUP_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xEC
// MOVDDUP X6, X7
#define MOVDDUP_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xFE
// MOVDDUP X8, X9
#define MOVDDUP_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC8
// ADDSUBPD X2, X3
#define ADDSUBPD_X2_X3 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA
// ADDSUBPD X4, X5
#define ADDSUBPD_X4_X5 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC
// ADDSUBPD X6, X7
#define ADDSUBPD_X6_X7 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE
// ADDSUBPD X8, X9
#define ADDSUBPD_X8_X9 BYTE $0x66; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8
// func AxpyUnitaryTo(dst []complex128, alpha complex128, x, y []complex128)
//
// AxpyUnitaryTo computes dst[i] = alpha * x[i] + y[i] for
// i in [0, min(len(x), len(y), len(dst))).
// The main loop processes four elements per iteration; the tail loop
// processes the remaining n % 4 elements one at a time.
// AX indexes the slices in float64 (8 byte) units, so one complex128
// element corresponds to an index step of 2.
// Register contents in the comments are written { high 64 bits, low 64 bits }.
TEXT ·AxpyUnitaryTo(SB), NOSPLIT, $0
MOVQ dst_base+0(FP), DI // DI = &dst
MOVQ x_base+40(FP), SI // SI = &x
MOVQ y_base+64(FP), DX // DX = &y
MOVQ x_len+48(FP), CX // CX = min( len(x), len(y), len(dst) )
CMPQ y_len+72(FP), CX
CMOVQLE y_len+72(FP), CX
CMPQ dst_len+8(FP), CX
CMOVQLE dst_len+8(FP), CX
CMPQ CX, $0 // if CX == 0 { return }
JE caxy_end
MOVUPS alpha+24(FP), X0 // X0 = { imag(a), real(a) }
MOVAPS X0, X1
SHUFPD $0x1, X1, X1 // X1 = { real(a), imag(a) }
XORQ AX, AX // i = 0
MOVAPS X0, X10 // Copy X0 and X1 for pipelining
MOVAPS X1, X11
MOVQ CX, BX
ANDQ $3, CX // CX = n % 4
SHRQ $2, BX // BX = floor( n / 4 )
JZ caxy_tail // if BX == 0 { goto caxy_tail }
caxy_loop: // do {
MOVUPS (SI)(AX*8), X2 // X_i = { imag(x[i]), real(x[i]) }
MOVUPS 16(SI)(AX*8), X4
MOVUPS 32(SI)(AX*8), X6
MOVUPS 48(SI)(AX*8), X8
// X_(i+1) = { real(x[i]), real(x[i]) }
MOVDDUP_X2_X3 // Duplicate real elements (xr, xr)
MOVDDUP_X4_X5
MOVDDUP_X6_X7
MOVDDUP_X8_X9
// X_i = { imag(x[i]), imag(x[i]) }
SHUFPD $0x3, X2, X2 // Duplicate imag elements (xi, xi)
SHUFPD $0x3, X4, X4
SHUFPD $0x3, X6, X6
SHUFPD $0x3, X8, X8
// X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
// X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
MULPD X1, X2
MULPD X0, X3
MULPD X11, X4
MULPD X10, X5
MULPD X1, X6
MULPD X0, X7
MULPD X11, X8
MULPD X10, X9
// X_(i+1) = {
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i])
// }
ADDSUBPD_X2_X3
ADDSUBPD_X4_X5
ADDSUBPD_X6_X7
ADDSUBPD_X8_X9
// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
ADDPD (DX)(AX*8), X3
ADDPD 16(DX)(AX*8), X5
ADDPD 32(DX)(AX*8), X7
ADDPD 48(DX)(AX*8), X9
MOVUPS X3, (DI)(AX*8) // dst[i] = X_(i+1)
MOVUPS X5, 16(DI)(AX*8)
MOVUPS X7, 32(DI)(AX*8)
MOVUPS X9, 48(DI)(AX*8)
ADDQ $8, AX // i += 8 (8 float64 = 4 complex128)
DECQ BX
JNZ caxy_loop // } while --BX > 0
CMPQ CX, $0 // if CX == 0 { return }
JE caxy_end
caxy_tail: // Same calculation, but read in values to avoid trampling memory
MOVUPS (SI)(AX*8), X2 // X_i = { imag(x[i]), real(x[i]) }
MOVDDUP_X2_X3 // X_(i+1) = { real(x[i]), real(x[i]) }
SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) }
MULPD X1, X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
MULPD X0, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) }
// X_(i+1) = {
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i])
// }
ADDSUBPD_X2_X3
// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
ADDPD (DX)(AX*8), X3
MOVUPS X3, (DI)(AX*8) // dst[i] = X_(i+1)
ADDQ $2, AX // i += 2 (2 float64 = 1 complex128)
LOOP caxy_tail // } while --CX > 0
caxy_end:
RET

6
vendor/gonum.org/v1/gonum/internal/asm/c128/doc.go generated vendored Normal file
View File

@@ -0,0 +1,6 @@
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package c128 provides complex128 vector primitives.
package c128 // import "gonum.org/v1/gonum/internal/asm/c128"

View File

@@ -0,0 +1,153 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
#define MOVDDUP_XPTR__X3 LONG $0x1E120FF2 // MOVDDUP (SI), X3
#define MOVDDUP_XPTR_INCX__X5 LONG $0x120F42F2; WORD $0x062C // MOVDDUP (SI)(R8*1), X5
#define MOVDDUP_XPTR_INCX_2__X7 LONG $0x120F42F2; WORD $0x463C // MOVDDUP (SI)(R8*2), X7
#define MOVDDUP_XPTR_INCx3X__X9 LONG $0x120F46F2; WORD $0x0E0C // MOVDDUP (SI)(R9*1), X9
#define MOVDDUP_8_XPTR__X2 LONG $0x56120FF2; BYTE $0x08 // MOVDDUP 8(SI), X2
#define MOVDDUP_8_XPTR_INCX__X4 LONG $0x120F42F2; WORD $0x0664; BYTE $0x08 // MOVDDUP 8(SI)(R8*1), X4
#define MOVDDUP_8_XPTR_INCX_2__X6 LONG $0x120F42F2; WORD $0x4674; BYTE $0x08 // MOVDDUP 8(SI)(R8*2), X6
#define MOVDDUP_8_XPTR_INCx3X__X8 LONG $0x120F46F2; WORD $0x0E44; BYTE $0x08 // MOVDDUP 8(SI)(R9*1), X8
#define ADDSUBPD_X2_X3 LONG $0xDAD00F66 // ADDSUBPD X2, X3
#define ADDSUBPD_X4_X5 LONG $0xECD00F66 // ADDSUBPD X4, X5
#define ADDSUBPD_X6_X7 LONG $0xFED00F66 // ADDSUBPD X6, X7
#define ADDSUBPD_X8_X9 LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9
#define X_PTR SI
#define Y_PTR DI
#define LEN CX
#define TAIL BX
#define SUM X0
#define P_SUM X1
#define INC_X R8
#define INCx3_X R9
#define INC_Y R10
#define INCx3_Y R11
#define NEG1 X15
#define P_NEG1 X14
// func DotcInc(x, y []complex128, n, incX, incY, ix, iy uintptr) (sum complex128)
//
// DotcInc returns the sum over i in [0, n) of
// conj(x[ix + i*incX]) * y[iy + i*incY].
// The main loop processes four elements per iteration, accumulating into
// two partial sums (SUM and P_SUM); the tail loop processes the remaining
// n % 4 elements one at a time.
// Register contents in the comments are written { high 64 bits, low 64 bits }.
TEXT ·DotcInc(SB), NOSPLIT, $0
MOVQ x_base+0(FP), X_PTR // X_PTR = &x
MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y
MOVQ n+48(FP), LEN // LEN = n
PXOR SUM, SUM // SUM = 0
CMPQ LEN, $0 // if LEN == 0 { return }
JE dot_end
PXOR P_SUM, P_SUM // P_SUM = 0
MOVQ ix+72(FP), INC_X // INC_X = ix * sizeof(complex128)
SHLQ $4, INC_X
MOVQ iy+80(FP), INC_Y // INC_Y = iy * sizeof(complex128)
SHLQ $4, INC_Y
LEAQ (X_PTR)(INC_X*1), X_PTR // X_PTR = &(X_PTR[ix])
LEAQ (Y_PTR)(INC_Y*1), Y_PTR // Y_PTR = &(Y_PTR[iy])
MOVQ incX+56(FP), INC_X // INC_X = incX
SHLQ $4, INC_X // INC_X *= sizeof(complex128)
MOVQ incY+64(FP), INC_Y // INC_Y = incY
SHLQ $4, INC_Y // INC_Y *= sizeof(complex128)
MOVSD $(-1.0), NEG1
SHUFPD $0, NEG1, NEG1 // { -1, -1 }
MOVQ LEN, TAIL
ANDQ $3, TAIL // TAIL = n % 4
SHRQ $2, LEN // LEN = floor( n / 4 )
JZ dot_tail // if n < 4 { goto dot_tail }
MOVAPS NEG1, P_NEG1 // Copy NEG1 to P_NEG1 for pipelining
LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = 3 * incX * sizeof(complex128)
LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = 3 * incY * sizeof(complex128)
dot_loop: // do {
MOVDDUP_XPTR__X3 // X_(i+1) = { real(x[i]), real(x[i]) }
MOVDDUP_XPTR_INCX__X5
MOVDDUP_XPTR_INCX_2__X7
MOVDDUP_XPTR_INCx3X__X9
MOVDDUP_8_XPTR__X2 // X_i = { imag(x[i]), imag(x[i]) }
MOVDDUP_8_XPTR_INCX__X4
MOVDDUP_8_XPTR_INCX_2__X6
MOVDDUP_8_XPTR_INCx3X__X8
// X_i = { -imag(x[i]), -imag(x[i]) }
// Negating imag(x[i]) is what conjugates x.
MULPD NEG1, X2
MULPD P_NEG1, X4
MULPD NEG1, X6
MULPD P_NEG1, X8
// X_j = { imag(y[i]), real(y[i]) }
MOVUPS (Y_PTR), X10
MOVUPS (Y_PTR)(INC_Y*1), X11
MOVUPS (Y_PTR)(INC_Y*2), X12
MOVUPS (Y_PTR)(INCx3_Y*1), X13
// X_(i+1) = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
MULPD X10, X3
MULPD X11, X5
MULPD X12, X7
MULPD X13, X9
// X_j = { real(y[i]), imag(y[i]) }
SHUFPD $0x1, X10, X10
SHUFPD $0x1, X11, X11
SHUFPD $0x1, X12, X12
SHUFPD $0x1, X13, X13
// X_i = { real(y[i]) * -imag(x[i]), imag(y[i]) * -imag(x[i]) }
MULPD X10, X2
MULPD X11, X4
MULPD X12, X6
MULPD X13, X8
// X_(i+1) = {
// imag(result[i]): imag(y[i])*real(x[i]) - real(y[i])*imag(x[i]),
// real(result[i]): real(y[i])*real(x[i]) + imag(y[i])*imag(x[i])
// }
ADDSUBPD_X2_X3
ADDSUBPD_X4_X5
ADDSUBPD_X6_X7
ADDSUBPD_X8_X9
// psum += result[i]
ADDPD X3, SUM
ADDPD X5, P_SUM
ADDPD X7, SUM
ADDPD X9, P_SUM
LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4])
LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[incY*4])
DECQ LEN
JNZ dot_loop // } while --LEN > 0
ADDPD P_SUM, SUM // sum += psum
CMPQ TAIL, $0 // if TAIL == 0 { return }
JE dot_end
dot_tail: // do {
MOVDDUP_XPTR__X3 // X_(i+1) = { real(x[i]), real(x[i]) }
MOVDDUP_8_XPTR__X2 // X_i = { imag(x[i]), imag(x[i]) }
MULPD NEG1, X2 // X_i = { -imag(x[i]) , -imag(x[i]) }
MOVUPS (Y_PTR), X10 // X_j = { imag(y[i]) , real(y[i]) }
MULPD X10, X3 // X_(i+1) = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
SHUFPD $0x1, X10, X10 // X_j = { real(y[i]) , imag(y[i]) }
MULPD X10, X2 // X_i = { real(y[i]) * -imag(x[i]), imag(y[i]) * -imag(x[i]) }
// X_(i+1) = {
// imag(result[i]): imag(y[i])*real(x[i]) - real(y[i])*imag(x[i]),
// real(result[i]): real(y[i])*real(x[i]) + imag(y[i])*imag(x[i])
// }
ADDSUBPD_X2_X3
ADDPD X3, SUM // sum += result[i]
ADDQ INC_X, X_PTR // X_PTR += incX
ADDQ INC_Y, Y_PTR // Y_PTR += incY
DECQ TAIL
JNZ dot_tail // } while --TAIL > 0
dot_end:
MOVUPS SUM, sum+88(FP)
RET

View File

@@ -0,0 +1,143 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
#define MOVDDUP_XPTR_IDX_8__X3 LONG $0x1C120FF2; BYTE $0xC6 // MOVDDUP (SI)(AX*8), X3
#define MOVDDUP_16_XPTR_IDX_8__X5 LONG $0x6C120FF2; WORD $0x10C6 // MOVDDUP 16(SI)(AX*8), X5
#define MOVDDUP_32_XPTR_IDX_8__X7 LONG $0x7C120FF2; WORD $0x20C6 // MOVDDUP 32(SI)(AX*8), X7
#define MOVDDUP_48_XPTR_IDX_8__X9 LONG $0x120F44F2; WORD $0xC64C; BYTE $0x30 // MOVDDUP 48(SI)(AX*8), X9
#define MOVDDUP_XPTR_IIDX_8__X2 LONG $0x14120FF2; BYTE $0xD6 // MOVDDUP (SI)(DX*8), X2
#define MOVDDUP_16_XPTR_IIDX_8__X4 LONG $0x64120FF2; WORD $0x10D6 // MOVDDUP 16(SI)(DX*8), X4
#define MOVDDUP_32_XPTR_IIDX_8__X6 LONG $0x74120FF2; WORD $0x20D6 // MOVDDUP 32(SI)(DX*8), X6
#define MOVDDUP_48_XPTR_IIDX_8__X8 LONG $0x120F44F2; WORD $0xD644; BYTE $0x30 // MOVDDUP 48(SI)(DX*8), X8
#define ADDSUBPD_X2_X3 LONG $0xDAD00F66 // ADDSUBPD X2, X3
#define ADDSUBPD_X4_X5 LONG $0xECD00F66 // ADDSUBPD X4, X5
#define ADDSUBPD_X6_X7 LONG $0xFED00F66 // ADDSUBPD X6, X7
#define ADDSUBPD_X8_X9 LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9
#define X_PTR SI
#define Y_PTR DI
#define LEN CX
#define TAIL BX
#define SUM X0
#define P_SUM X1
#define IDX AX
#define I_IDX DX
#define NEG1 X15
#define P_NEG1 X14
// func DotcUnitary(x, y []complex128) (sum complex128)
//
// DotcUnitary returns the sum over i in [0, min(len(x), len(y))) of
// conj(x[i]) * y[i].
// The main loop processes four elements per iteration, accumulating into
// two partial sums (SUM and P_SUM); the tail loop processes the remaining
// n % 4 elements one at a time.
// IDX and I_IDX index the slices in float64 (8 byte) units: IDX addresses
// the real part of the current element and I_IDX = IDX + 1 its imaginary part.
// Register contents in the comments are written { high 64 bits, low 64 bits }.
TEXT ·DotcUnitary(SB), NOSPLIT, $0
MOVQ x_base+0(FP), X_PTR // X_PTR = &x
MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y
MOVQ x_len+8(FP), LEN // LEN = min( len(x), len(y) )
CMPQ y_len+32(FP), LEN
CMOVQLE y_len+32(FP), LEN
PXOR SUM, SUM // sum = 0
CMPQ LEN, $0 // if LEN == 0 { return }
JE dot_end
XORPS P_SUM, P_SUM // psum = 0
MOVSD $(-1.0), NEG1
SHUFPD $0, NEG1, NEG1 // { -1, -1 }
XORQ IDX, IDX // i := 0
MOVQ $1, I_IDX // j := 1
MOVQ LEN, TAIL
ANDQ $3, TAIL // TAIL = LEN % 4
SHRQ $2, LEN // LEN = floor( LEN / 4 )
JZ dot_tail // if LEN == 0 { goto dot_tail }
MOVAPS NEG1, P_NEG1 // Copy NEG1 to P_NEG1 for pipelining
dot_loop: // do {
MOVDDUP_XPTR_IDX_8__X3 // X_(i+1) = { real(x[i]), real(x[i]) }
MOVDDUP_16_XPTR_IDX_8__X5
MOVDDUP_32_XPTR_IDX_8__X7
MOVDDUP_48_XPTR_IDX_8__X9
MOVDDUP_XPTR_IIDX_8__X2 // X_i = { imag(x[i]), imag(x[i]) }
MOVDDUP_16_XPTR_IIDX_8__X4
MOVDDUP_32_XPTR_IIDX_8__X6
MOVDDUP_48_XPTR_IIDX_8__X8
// X_i = { -imag(x[i]), -imag(x[i]) }
// Negating imag(x[i]) is what conjugates x.
MULPD NEG1, X2
MULPD P_NEG1, X4
MULPD NEG1, X6
MULPD P_NEG1, X8
// X_j = { imag(y[i]), real(y[i]) }
MOVUPS (Y_PTR)(IDX*8), X10
MOVUPS 16(Y_PTR)(IDX*8), X11
MOVUPS 32(Y_PTR)(IDX*8), X12
MOVUPS 48(Y_PTR)(IDX*8), X13
// X_(i+1) = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
MULPD X10, X3
MULPD X11, X5
MULPD X12, X7
MULPD X13, X9
// X_j = { real(y[i]), imag(y[i]) }
SHUFPD $0x1, X10, X10
SHUFPD $0x1, X11, X11
SHUFPD $0x1, X12, X12
SHUFPD $0x1, X13, X13
// X_i = { real(y[i]) * -imag(x[i]), imag(y[i]) * -imag(x[i]) }
MULPD X10, X2
MULPD X11, X4
MULPD X12, X6
MULPD X13, X8
// X_(i+1) = {
// imag(result[i]): imag(y[i])*real(x[i]) - real(y[i])*imag(x[i]),
// real(result[i]): real(y[i])*real(x[i]) + imag(y[i])*imag(x[i])
// }
ADDSUBPD_X2_X3
ADDSUBPD_X4_X5
ADDSUBPD_X6_X7
ADDSUBPD_X8_X9
// psum += result[i]
ADDPD X3, SUM
ADDPD X5, P_SUM
ADDPD X7, SUM
ADDPD X9, P_SUM
ADDQ $8, IDX // IDX += 8 (8 float64 = 4 complex128)
ADDQ $8, I_IDX // I_IDX += 8
DECQ LEN
JNZ dot_loop // } while --LEN > 0
ADDPD P_SUM, SUM // sum += psum
CMPQ TAIL, $0 // if TAIL == 0 { return }
JE dot_end
dot_tail: // do {
MOVDDUP_XPTR_IDX_8__X3 // X_(i+1) = { real(x[i]) , real(x[i]) }
MOVDDUP_XPTR_IIDX_8__X2 // X_i = { imag(x[i]) , imag(x[i]) }
MULPD NEG1, X2 // X_i = { -imag(x[i]) , -imag(x[i]) }
MOVUPS (Y_PTR)(IDX*8), X10 // X_j = { imag(y[i]) , real(y[i]) }
MULPD X10, X3 // X_(i+1) = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
SHUFPD $0x1, X10, X10 // X_j = { real(y[i]) , imag(y[i]) }
MULPD X10, X2 // X_i = { real(y[i]) * -imag(x[i]), imag(y[i]) * -imag(x[i]) }
// X_(i+1) = {
// imag(result[i]): imag(y[i])*real(x[i]) - real(y[i])*imag(x[i]),
// real(result[i]): real(y[i])*real(x[i]) + imag(y[i])*imag(x[i])
// }
ADDSUBPD_X2_X3
ADDPD X3, SUM // SUM += result[i]
ADDQ $2, IDX // IDX += 2 (2 float64 = 1 complex128)
ADDQ $2, I_IDX // I_IDX += 2
DECQ TAIL
JNZ dot_tail // } while --TAIL > 0
dot_end:
MOVUPS SUM, sum+48(FP)
RET

View File

@@ -0,0 +1,141 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
#define MOVDDUP_XPTR__X3 LONG $0x1E120FF2 // MOVDDUP (SI), X3
#define MOVDDUP_XPTR_INCX__X5 LONG $0x120F42F2; WORD $0x062C // MOVDDUP (SI)(R8*1), X5
#define MOVDDUP_XPTR_INCX_2__X7 LONG $0x120F42F2; WORD $0x463C // MOVDDUP (SI)(R8*2), X7
#define MOVDDUP_XPTR_INCx3X__X9 LONG $0x120F46F2; WORD $0x0E0C // MOVDDUP (SI)(R9*1), X9
#define MOVDDUP_8_XPTR__X2 LONG $0x56120FF2; BYTE $0x08 // MOVDDUP 8(SI), X2
#define MOVDDUP_8_XPTR_INCX__X4 LONG $0x120F42F2; WORD $0x0664; BYTE $0x08 // MOVDDUP 8(SI)(R8*1), X4
#define MOVDDUP_8_XPTR_INCX_2__X6 LONG $0x120F42F2; WORD $0x4674; BYTE $0x08 // MOVDDUP 8(SI)(R8*2), X6
#define MOVDDUP_8_XPTR_INCx3X__X8 LONG $0x120F46F2; WORD $0x0E44; BYTE $0x08 // MOVDDUP 8(SI)(R9*1), X8
#define ADDSUBPD_X2_X3 LONG $0xDAD00F66 // ADDSUBPD X2, X3
#define ADDSUBPD_X4_X5 LONG $0xECD00F66 // ADDSUBPD X4, X5
#define ADDSUBPD_X6_X7 LONG $0xFED00F66 // ADDSUBPD X6, X7
#define ADDSUBPD_X8_X9 LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9
#define X_PTR SI
#define Y_PTR DI
#define LEN CX
#define TAIL BX
#define SUM X0
#define P_SUM X1
#define INC_X R8
#define INCx3_X R9
#define INC_Y R10
#define INCx3_Y R11
// func DotuInc(x, y []complex128, n, incX, incY, ix, iy uintptr) (sum complex128)
//
// DotuInc returns the sum over i in [0, n) of
// x[ix + i*incX] * y[iy + i*incY] (no conjugation).
// The main loop processes four elements per iteration, accumulating into
// two partial sums (SUM and P_SUM); the tail loop processes the remaining
// n % 4 elements one at a time.
// Register contents in the comments are written { high 64 bits, low 64 bits }.
TEXT ·DotuInc(SB), NOSPLIT, $0
MOVQ x_base+0(FP), X_PTR // X_PTR = &x
MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y
MOVQ n+48(FP), LEN // LEN = n
PXOR SUM, SUM // sum = 0
CMPQ LEN, $0 // if LEN == 0 { return }
JE dot_end
MOVQ ix+72(FP), INC_X // INC_X = ix * sizeof(complex128)
SHLQ $4, INC_X
MOVQ iy+80(FP), INC_Y // INC_Y = iy * sizeof(complex128)
SHLQ $4, INC_Y
LEAQ (X_PTR)(INC_X*1), X_PTR // X_PTR = &(X_PTR[ix])
LEAQ (Y_PTR)(INC_Y*1), Y_PTR // Y_PTR = &(Y_PTR[iy])
MOVQ incX+56(FP), INC_X // INC_X = incX
SHLQ $4, INC_X // INC_X *= sizeof(complex128)
MOVQ incY+64(FP), INC_Y // INC_Y = incY
SHLQ $4, INC_Y // INC_Y *= sizeof(complex128)
MOVQ LEN, TAIL
ANDQ $3, TAIL // TAIL = LEN % 4
SHRQ $2, LEN // LEN = floor( LEN / 4 )
JZ dot_tail // if n < 4 { goto dot_tail }
PXOR P_SUM, P_SUM // psum = 0
LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = 3 * incX * sizeof(complex128)
LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = 3 * incY * sizeof(complex128)
dot_loop: // do {
MOVDDUP_XPTR__X3 // X_(i+1) = { real(x[i]), real(x[i]) }
MOVDDUP_XPTR_INCX__X5
MOVDDUP_XPTR_INCX_2__X7
MOVDDUP_XPTR_INCx3X__X9
MOVDDUP_8_XPTR__X2 // X_i = { imag(x[i]), imag(x[i]) }
MOVDDUP_8_XPTR_INCX__X4
MOVDDUP_8_XPTR_INCX_2__X6
MOVDDUP_8_XPTR_INCx3X__X8
// X_j = { imag(y[i]), real(y[i]) }
MOVUPS (Y_PTR), X10
MOVUPS (Y_PTR)(INC_Y*1), X11
MOVUPS (Y_PTR)(INC_Y*2), X12
MOVUPS (Y_PTR)(INCx3_Y*1), X13
// X_(i+1) = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
MULPD X10, X3
MULPD X11, X5
MULPD X12, X7
MULPD X13, X9
// X_j = { real(y[i]), imag(y[i]) }
SHUFPD $0x1, X10, X10
SHUFPD $0x1, X11, X11
SHUFPD $0x1, X12, X12
SHUFPD $0x1, X13, X13
// X_i = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i]) }
MULPD X10, X2
MULPD X11, X4
MULPD X12, X6
MULPD X13, X8
// X_(i+1) = {
// imag(result[i]): imag(y[i])*real(x[i]) + real(y[i])*imag(x[i]),
// real(result[i]): real(y[i])*real(x[i]) - imag(y[i])*imag(x[i])
// }
ADDSUBPD_X2_X3
ADDSUBPD_X4_X5
ADDSUBPD_X6_X7
ADDSUBPD_X8_X9
// psum += result[i]
ADDPD X3, SUM
ADDPD X5, P_SUM
ADDPD X7, SUM
ADDPD X9, P_SUM
LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4])
LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[incY*4])
DECQ LEN
JNZ dot_loop // } while --LEN > 0
ADDPD P_SUM, SUM // sum += psum
CMPQ TAIL, $0 // if TAIL == 0 { return }
JE dot_end
dot_tail: // do {
MOVDDUP_XPTR__X3 // X_(i+1) = { real(x[i]), real(x[i]) }
MOVDDUP_8_XPTR__X2 // X_i = { imag(x[i]), imag(x[i]) }
MOVUPS (Y_PTR), X10 // X_j = { imag(y[i]) , real(y[i]) }
MULPD X10, X3 // X_(i+1) = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
SHUFPD $0x1, X10, X10 // X_j = { real(y[i]) , imag(y[i]) }
MULPD X10, X2 // X_i = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i]) }
// X_(i+1) = {
// imag(result[i]): imag(y[i])*real(x[i]) + real(y[i])*imag(x[i]),
// real(result[i]): real(y[i])*real(x[i]) - imag(y[i])*imag(x[i])
// }
ADDSUBPD_X2_X3
ADDPD X3, SUM // sum += result[i]
ADDQ INC_X, X_PTR // X_PTR += incX
ADDQ INC_Y, Y_PTR // Y_PTR += incY
DECQ TAIL // --TAIL
JNZ dot_tail // } while TAIL > 0
dot_end:
MOVUPS SUM, sum+88(FP)
RET

View File

@@ -0,0 +1,130 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
#define MOVDDUP_XPTR_IDX_8__X3 LONG $0x1C120FF2; BYTE $0xC6 // MOVDDUP (SI)(AX*8), X3
#define MOVDDUP_16_XPTR_IDX_8__X5 LONG $0x6C120FF2; WORD $0x10C6 // MOVDDUP 16(SI)(AX*8), X5
#define MOVDDUP_32_XPTR_IDX_8__X7 LONG $0x7C120FF2; WORD $0x20C6 // MOVDDUP 32(SI)(AX*8), X7
#define MOVDDUP_48_XPTR_IDX_8__X9 LONG $0x120F44F2; WORD $0xC64C; BYTE $0x30 // MOVDDUP 48(SI)(AX*8), X9
#define MOVDDUP_XPTR_IIDX_8__X2 LONG $0x14120FF2; BYTE $0xD6 // MOVDDUP (SI)(DX*8), X2
#define MOVDDUP_16_XPTR_IIDX_8__X4 LONG $0x64120FF2; WORD $0x10D6 // MOVDDUP 16(SI)(DX*8), X4
#define MOVDDUP_32_XPTR_IIDX_8__X6 LONG $0x74120FF2; WORD $0x20D6 // MOVDDUP 32(SI)(DX*8), X6
#define MOVDDUP_48_XPTR_IIDX_8__X8 LONG $0x120F44F2; WORD $0xD644; BYTE $0x30 // MOVDDUP 48(SI)(DX*8), X8
#define ADDSUBPD_X2_X3 LONG $0xDAD00F66 // ADDSUBPD X2, X3
#define ADDSUBPD_X4_X5 LONG $0xECD00F66 // ADDSUBPD X4, X5
#define ADDSUBPD_X6_X7 LONG $0xFED00F66 // ADDSUBPD X6, X7
#define ADDSUBPD_X8_X9 LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9
#define X_PTR SI
#define Y_PTR DI
#define LEN CX
#define TAIL BX
#define SUM X0
#define P_SUM X1
#define IDX AX
#define I_IDX DX
// func DotuUnitary(x, y []complex128) (sum complex128)
//
// DotuUnitary computes the unconjugated dot product of x and y over the first
// min(len(x), len(y)) elements. IDX and I_IDX count float64 slots (scale 8):
// IDX addresses the real part of the current element and I_IDX = IDX+1 its
// imaginary part. The main loop handles four elements per pass with two
// accumulators (SUM, P_SUM); dot_tail handles the remaining LEN % 4 elements.
// Lane comments below are written { high lane, low lane }.
TEXT ·DotuUnitary(SB), NOSPLIT, $0
MOVQ x_base+0(FP), X_PTR // X_PTR = &x
MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y
MOVQ x_len+8(FP), LEN // LEN = min( len(x), len(y) )
CMPQ y_len+32(FP), LEN
CMOVQLE y_len+32(FP), LEN
PXOR SUM, SUM // SUM = 0
CMPQ LEN, $0 // if LEN == 0 { return }
JE dot_end
PXOR P_SUM, P_SUM // P_SUM = 0
XORQ IDX, IDX // IDX = 0
MOVQ $1, DX // I_IDX = 1 (DX is I_IDX: float64 slot of imag(x[0]))
MOVQ LEN, TAIL
ANDQ $3, TAIL // TAIL = LEN % 4
SHRQ $2, LEN // LEN = floor( LEN / 4 )
JZ dot_tail // if LEN == 0 { goto dot_tail }
dot_loop: // do {
MOVDDUP_XPTR_IDX_8__X3 // X_(i+1) = { real(x[i]), real(x[i]) }
MOVDDUP_16_XPTR_IDX_8__X5
MOVDDUP_32_XPTR_IDX_8__X7
MOVDDUP_48_XPTR_IDX_8__X9
MOVDDUP_XPTR_IIDX_8__X2 // X_i = { imag(x[i]), imag(x[i]) }
MOVDDUP_16_XPTR_IIDX_8__X4
MOVDDUP_32_XPTR_IIDX_8__X6
MOVDDUP_48_XPTR_IIDX_8__X8
// X_j = { imag(y[i]), real(y[i]) }
MOVUPS (Y_PTR)(IDX*8), X10
MOVUPS 16(Y_PTR)(IDX*8), X11
MOVUPS 32(Y_PTR)(IDX*8), X12
MOVUPS 48(Y_PTR)(IDX*8), X13
// X_(i+1) = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
MULPD X10, X3
MULPD X11, X5
MULPD X12, X7
MULPD X13, X9
// Swap lanes: X_j = { real(y[i]), imag(y[i]) }
SHUFPD $0x1, X10, X10
SHUFPD $0x1, X11, X11
SHUFPD $0x1, X12, X12
SHUFPD $0x1, X13, X13
// X_i = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i]) }
MULPD X10, X2
MULPD X11, X4
MULPD X12, X6
MULPD X13, X8
// X_(i+1) = {
// imag(result[i]): imag(y[i])*real(x[i]) + real(y[i])*imag(x[i]),
// real(result[i]): real(y[i])*real(x[i]) - imag(y[i])*imag(x[i])
// }
ADDSUBPD_X2_X3
ADDSUBPD_X4_X5
ADDSUBPD_X6_X7
ADDSUBPD_X8_X9
// Accumulate the four products, alternating between SUM and P_SUM
ADDPD X3, SUM
ADDPD X5, P_SUM
ADDPD X7, SUM
ADDPD X9, P_SUM
ADDQ $8, IDX // IDX += 8 (four complex128 values)
ADDQ $8, I_IDX // I_IDX += 8
DECQ LEN
JNZ dot_loop // } while --LEN > 0
ADDPD P_SUM, SUM // SUM += P_SUM
CMPQ TAIL, $0 // if TAIL == 0 { return }
JE dot_end
dot_tail: // do {
MOVDDUP_XPTR_IDX_8__X3 // X_(i+1) = { real(x[i]) , real(x[i]) }
MOVDDUP_XPTR_IIDX_8__X2 // X_i = { imag(x[i]) , imag(x[i]) }
MOVUPS (Y_PTR)(IDX*8), X10 // X_j = { imag(y[i]) , real(y[i]) }
MULPD X10, X3 // X_(i+1) = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
SHUFPD $0x1, X10, X10 // X_j = { real(y[i]) , imag(y[i]) }
MULPD X10, X2 // X_i = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i]) }
// X_(i+1) = {
// imag(result[i]): imag(y[i])*real(x[i]) + real(y[i])*imag(x[i]),
// real(result[i]): real(y[i])*real(x[i]) - imag(y[i])*imag(x[i])
// }
ADDSUBPD_X2_X3
ADDPD X3, SUM // SUM += result[i]
ADDQ $2, IDX // IDX += 2 (one complex128 value)
ADDQ $2, I_IDX // I_IDX += 2
DECQ TAIL // --TAIL
JNZ dot_tail // } while TAIL > 0
dot_end:
MOVUPS SUM, sum+48(FP)
RET

View File

@@ -0,0 +1,69 @@
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
#define SRC SI
#define DST SI
#define LEN CX
#define TAIL BX
#define INC R9
#define INC3 R10
#define ALPHA X0
#define ALPHA_2 X1
#define MOVDDUP_ALPHA LONG $0x44120FF2; WORD $0x0824 // MOVDDUP 8(SP), X0
// func DscalInc(alpha float64, x []complex128, n, inc uintptr)
//
// DscalInc multiplies both the real and imaginary part of n elements of x,
// spaced inc elements apart and starting at x[0], by the real scalar alpha.
// SRC and DST are the same register, so the update happens in place. The main
// loop handles four elements per pass; the tail loop handles the remaining
// n % 4 elements.
TEXT ·DscalInc(SB), NOSPLIT, $0
MOVQ x_base+8(FP), SRC // SRC = &x
MOVQ n+32(FP), LEN // LEN = n
CMPQ LEN, $0 // if LEN == 0 { return }
JE dscal_end
MOVDDUP_ALPHA // ALPHA = { alpha, alpha }
MOVQ inc+40(FP), INC // INC = inc
SHLQ $4, INC // INC = INC * sizeof(complex128)
LEAQ (INC)(INC*2), INC3 // INC3 = 3 * INC
MOVUPS ALPHA, ALPHA_2 // Copy ALPHA to ALPHA_2 for pipelining
MOVQ LEN, TAIL // TAIL = LEN
SHRQ $2, LEN // LEN = floor( n / 4 )
JZ dscal_tail // if LEN == 0 { goto dscal_tail }
dscal_loop: // do {
MOVUPS (SRC), X2 // X_i = x[i]
MOVUPS (SRC)(INC*1), X3
MOVUPS (SRC)(INC*2), X4
MOVUPS (SRC)(INC3*1), X5
MULPD ALPHA, X2 // X_i *= ALPHA
MULPD ALPHA_2, X3
MULPD ALPHA, X4
MULPD ALPHA_2, X5
MOVUPS X2, (DST) // x[i] = X_i
MOVUPS X3, (DST)(INC*1)
MOVUPS X4, (DST)(INC*2)
MOVUPS X5, (DST)(INC3*1)
LEAQ (SRC)(INC*4), SRC // SRC += INC*4
DECQ LEN
JNZ dscal_loop // } while --LEN > 0
dscal_tail:
ANDQ $3, TAIL // TAIL = TAIL % 4
JE dscal_end // if TAIL == 0 { return }
dscal_tail_loop: // do {
MOVUPS (SRC), X2 // X_i = x[i]
MULPD ALPHA, X2 // X_i *= ALPHA
MOVUPS X2, (DST) // x[i] = X_i
ADDQ INC, SRC // SRC += INC
DECQ TAIL
JNZ dscal_tail_loop // } while --TAIL > 0
dscal_end:
RET

View File

@@ -0,0 +1,66 @@
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
#define SRC SI
#define DST SI
#define LEN CX
#define IDX AX
#define TAIL BX
#define ALPHA X0
#define ALPHA_2 X1
#define MOVDDUP_ALPHA LONG $0x44120FF2; WORD $0x0824 // MOVDDUP 8(SP), X0
// func DscalUnitary(alpha float64, x []complex128)
//
// DscalUnitary multiplies both the real and imaginary part of every element of
// x by the real scalar alpha, in place (SRC and DST are the same register).
// IDX counts float64 slots (scale 8), so one complex128 advances it by 2.
// The main loop handles four elements per pass; the tail loop handles the
// remaining len(x) % 4 elements.
TEXT ·DscalUnitary(SB), NOSPLIT, $0
MOVQ x_base+8(FP), SRC // SRC = &x
MOVQ x_len+16(FP), LEN // LEN = len(x)
CMPQ LEN, $0 // if LEN == 0 { return }
JE dscal_end
MOVDDUP_ALPHA // ALPHA = { alpha, alpha }
XORQ IDX, IDX // IDX = 0
MOVUPS ALPHA, ALPHA_2 // Copy ALPHA to ALPHA_2 for pipelining
MOVQ LEN, TAIL // TAIL = LEN
SHRQ $2, LEN // LEN = floor( len(x) / 4 )
JZ dscal_tail // if LEN == 0 { goto dscal_tail }
dscal_loop: // do {
MOVUPS (SRC)(IDX*8), X2 // X_i = x[i]
MOVUPS 16(SRC)(IDX*8), X3
MOVUPS 32(SRC)(IDX*8), X4
MOVUPS 48(SRC)(IDX*8), X5
MULPD ALPHA, X2 // X_i *= ALPHA
MULPD ALPHA_2, X3
MULPD ALPHA, X4
MULPD ALPHA_2, X5
MOVUPS X2, (DST)(IDX*8) // x[i] = X_i
MOVUPS X3, 16(DST)(IDX*8)
MOVUPS X4, 32(DST)(IDX*8)
MOVUPS X5, 48(DST)(IDX*8)
ADDQ $8, IDX // IDX += 8 (four complex128 values)
DECQ LEN
JNZ dscal_loop // } while --LEN > 0
dscal_tail:
ANDQ $3, TAIL // TAIL = TAIL % 4
JZ dscal_end // if TAIL == 0 { return }
dscal_tail_loop: // do {
MOVUPS (SRC)(IDX*8), X2 // X_i = x[i]
MULPD ALPHA, X2 // X_i *= ALPHA
MOVUPS X2, (DST)(IDX*8) // x[i] = X_i
ADDQ $2, IDX // IDX += 2 (one complex128 value)
DECQ TAIL
JNZ dscal_tail_loop // } while --TAIL > 0
dscal_end:
RET

33
vendor/gonum.org/v1/gonum/internal/asm/c128/scal.go generated vendored Normal file
View File

@@ -0,0 +1,33 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package c128
// ScalUnitaryTo writes alpha*x[i] into dst[i] for every index i of x.
// dst is indexed with the indices of x, so it must be at least as long as x.
func ScalUnitaryTo(dst []complex128, alpha complex128, x []complex128) {
	for i := range x {
		dst[i] = alpha * x[i]
	}
}
// ScalIncTo writes alpha times n strided elements of x into strided slots of
// dst. Both walks start at index 0; x advances by incX and dst by incDst after
// each element.
func ScalIncTo(dst []complex128, incDst uintptr, alpha complex128, x []complex128, n, incX uintptr) {
	count := int(n)
	for i, ix, idst := 0, uintptr(0), uintptr(0); i < count; i, ix, idst = i+1, ix+incX, idst+incDst {
		dst[idst] = alpha * x[ix]
	}
}

View File

@@ -0,0 +1,116 @@
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
#define SRC SI
#define DST SI
#define LEN CX
#define IDX AX
#define TAIL BX
#define ALPHA X0
#define ALPHA_C X1
#define ALPHA2 X10
#define ALPHA_C2 X11
#define MOVDDUP_X2_X3 LONG $0xDA120FF2 // MOVDDUP X2, X3
#define MOVDDUP_X4_X5 LONG $0xEC120FF2 // MOVDDUP X4, X5
#define MOVDDUP_X6_X7 LONG $0xFE120FF2 // MOVDDUP X6, X7
#define MOVDDUP_X8_X9 LONG $0x120F45F2; BYTE $0xC8 // MOVDDUP X8, X9
#define ADDSUBPD_X2_X3 LONG $0xDAD00F66 // ADDSUBPD X2, X3
#define ADDSUBPD_X4_X5 LONG $0xECD00F66 // ADDSUBPD X4, X5
#define ADDSUBPD_X6_X7 LONG $0xFED00F66 // ADDSUBPD X6, X7
#define ADDSUBPD_X8_X9 LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9
// func ScalUnitary(alpha complex128, x []complex128)
//
// ScalUnitary multiplies every element of x by the complex scalar alpha, in
// place (SRC and DST are the same register). IDX counts float64 slots
// (scale 8), so one complex128 advances it by 2. The main loop handles four
// elements per pass; the tail loop handles the remaining len(x) % 4 elements.
// Lane comments below are written { high lane, low lane }.
TEXT ·ScalUnitary(SB), NOSPLIT, $0
MOVQ x_base+16(FP), SRC // SRC = &x
MOVQ x_len+24(FP), LEN // LEN = len(x)
CMPQ LEN, $0 // if LEN == 0 { return }
JE scal_end
MOVUPS alpha+0(FP), ALPHA // ALPHA = { imag(alpha), real(alpha) }
MOVAPS ALPHA, ALPHA_C
SHUFPD $0x1, ALPHA_C, ALPHA_C // ALPHA_C = { real(alpha), imag(alpha) }
XORQ IDX, IDX // IDX = 0
MOVAPS ALPHA, ALPHA2 // Copy ALPHA and ALPHA_C for pipelining
MOVAPS ALPHA_C, ALPHA_C2
MOVQ LEN, TAIL
SHRQ $2, LEN // LEN = floor( len(x) / 4 )
JZ scal_tail // if LEN == 0 { goto scal_tail }
scal_loop: // do {
MOVUPS (SRC)(IDX*8), X2 // X_i = { imag(x[i]), real(x[i]) }
MOVUPS 16(SRC)(IDX*8), X4
MOVUPS 32(SRC)(IDX*8), X6
MOVUPS 48(SRC)(IDX*8), X8
// X_(i+1) = { real(x[i]), real(x[i]) }
MOVDDUP_X2_X3
MOVDDUP_X4_X5
MOVDDUP_X6_X7
MOVDDUP_X8_X9
// X_i = { imag(x[i]), imag(x[i]) }
SHUFPD $0x3, X2, X2
SHUFPD $0x3, X4, X4
SHUFPD $0x3, X6, X6
SHUFPD $0x3, X8, X8
// X_i = { real(ALPHA) * imag(x[i]), imag(ALPHA) * imag(x[i]) }
// X_(i+1) = { imag(ALPHA) * real(x[i]), real(ALPHA) * real(x[i]) }
MULPD ALPHA_C, X2
MULPD ALPHA, X3
MULPD ALPHA_C2, X4
MULPD ALPHA2, X5
MULPD ALPHA_C, X6
MULPD ALPHA, X7
MULPD ALPHA_C2, X8
MULPD ALPHA2, X9
// X_(i+1) = {
// imag(result[i]): imag(ALPHA)*real(x[i]) + real(ALPHA)*imag(x[i]),
// real(result[i]): real(ALPHA)*real(x[i]) - imag(ALPHA)*imag(x[i])
// }
ADDSUBPD_X2_X3
ADDSUBPD_X4_X5
ADDSUBPD_X6_X7
ADDSUBPD_X8_X9
MOVUPS X3, (DST)(IDX*8) // x[i] = X_(i+1)
MOVUPS X5, 16(DST)(IDX*8)
MOVUPS X7, 32(DST)(IDX*8)
MOVUPS X9, 48(DST)(IDX*8)
ADDQ $8, IDX // IDX += 8 (four complex128 values)
DECQ LEN
JNZ scal_loop // } while --LEN > 0
scal_tail:
ANDQ $3, TAIL // TAIL = TAIL % 4
JZ scal_end // if TAIL == 0 { return }
scal_tail_loop: // do {
MOVUPS (SRC)(IDX*8), X2 // X_i = { imag(x[i]), real(x[i]) }
MOVDDUP_X2_X3 // X_(i+1) = { real(x[i]), real(x[i]) }
SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) }
MULPD ALPHA_C, X2 // X_i = { real(ALPHA) * imag(x[i]), imag(ALPHA) * imag(x[i]) }
MULPD ALPHA, X3 // X_(i+1) = { imag(ALPHA) * real(x[i]), real(ALPHA) * real(x[i]) }
// X_(i+1) = {
// imag(result[i]): imag(ALPHA)*real(x[i]) + real(ALPHA)*imag(x[i]),
// real(result[i]): real(ALPHA)*real(x[i]) - imag(ALPHA)*imag(x[i])
// }
ADDSUBPD_X2_X3
MOVUPS X3, (DST)(IDX*8) // x[i] = X_(i+1)
ADDQ $2, IDX // IDX += 2 (one complex128 value)
DECQ TAIL
JNZ scal_tail_loop // } while --TAIL > 0
scal_end:
RET

View File

@@ -0,0 +1,121 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
#define SRC SI
#define DST SI
#define LEN CX
#define TAIL BX
#define INC R9
#define INC3 R10
#define ALPHA X0
#define ALPHA_C X1
#define ALPHA2 X10
#define ALPHA_C2 X11
#define MOVDDUP_X2_X3 LONG $0xDA120FF2 // MOVDDUP X2, X3
#define MOVDDUP_X4_X5 LONG $0xEC120FF2 // MOVDDUP X4, X5
#define MOVDDUP_X6_X7 LONG $0xFE120FF2 // MOVDDUP X6, X7
#define MOVDDUP_X8_X9 LONG $0x120F45F2; BYTE $0xC8 // MOVDDUP X8, X9
#define ADDSUBPD_X2_X3 LONG $0xDAD00F66 // ADDSUBPD X2, X3
#define ADDSUBPD_X4_X5 LONG $0xECD00F66 // ADDSUBPD X4, X5
#define ADDSUBPD_X6_X7 LONG $0xFED00F66 // ADDSUBPD X6, X7
#define ADDSUBPD_X8_X9 LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9
// func ScalInc(alpha complex128, x []complex128, n, inc uintptr)
//
// ScalInc multiplies n elements of x, spaced inc elements apart and starting
// at x[0], by the complex scalar alpha, in place (SRC and DST are the same
// register). The main loop handles four elements per pass; the tail loop
// handles the remaining n % 4 elements.
// Lane comments below are written { high lane, low lane }.
TEXT ·ScalInc(SB), NOSPLIT, $0
MOVQ x_base+16(FP), SRC // SRC = &x
MOVQ n+40(FP), LEN // LEN = n
CMPQ LEN, $0
JE scal_end // if LEN == 0 { return }
MOVQ inc+48(FP), INC // INC = inc
SHLQ $4, INC // INC = INC * sizeof(complex128)
LEAQ (INC)(INC*2), INC3 // INC3 = 3 * INC
MOVUPS alpha+0(FP), ALPHA // ALPHA = { imag(alpha), real(alpha) }
MOVAPS ALPHA, ALPHA_C
SHUFPD $0x1, ALPHA_C, ALPHA_C // ALPHA_C = { real(alpha), imag(alpha) }
MOVAPS ALPHA, ALPHA2 // Copy ALPHA and ALPHA_C for pipelining
MOVAPS ALPHA_C, ALPHA_C2
MOVQ LEN, TAIL
SHRQ $2, LEN // LEN = floor( n / 4 )
JZ scal_tail // if LEN == 0 { goto scal_tail }
scal_loop: // do {
MOVUPS (SRC), X2 // X_i = { imag(x[i]), real(x[i]) }
MOVUPS (SRC)(INC*1), X4
MOVUPS (SRC)(INC*2), X6
MOVUPS (SRC)(INC3*1), X8
// X_(i+1) = { real(x[i]), real(x[i]) }
MOVDDUP_X2_X3
MOVDDUP_X4_X5
MOVDDUP_X6_X7
MOVDDUP_X8_X9
// X_i = { imag(x[i]), imag(x[i]) }
SHUFPD $0x3, X2, X2
SHUFPD $0x3, X4, X4
SHUFPD $0x3, X6, X6
SHUFPD $0x3, X8, X8
// X_i = { real(ALPHA) * imag(x[i]), imag(ALPHA) * imag(x[i]) }
// X_(i+1) = { imag(ALPHA) * real(x[i]), real(ALPHA) * real(x[i]) }
MULPD ALPHA_C, X2
MULPD ALPHA, X3
MULPD ALPHA_C2, X4
MULPD ALPHA2, X5
MULPD ALPHA_C, X6
MULPD ALPHA, X7
MULPD ALPHA_C2, X8
MULPD ALPHA2, X9
// X_(i+1) = {
// imag(result[i]): imag(ALPHA)*real(x[i]) + real(ALPHA)*imag(x[i]),
// real(result[i]): real(ALPHA)*real(x[i]) - imag(ALPHA)*imag(x[i])
// }
ADDSUBPD_X2_X3
ADDSUBPD_X4_X5
ADDSUBPD_X6_X7
ADDSUBPD_X8_X9
MOVUPS X3, (DST) // x[i] = X_(i+1)
MOVUPS X5, (DST)(INC*1)
MOVUPS X7, (DST)(INC*2)
MOVUPS X9, (DST)(INC3*1)
LEAQ (SRC)(INC*4), SRC // SRC = &(SRC[inc*4])
DECQ LEN
JNZ scal_loop // } while --LEN > 0
scal_tail:
ANDQ $3, TAIL // TAIL = TAIL % 4
JE scal_end // if TAIL == 0 { return }
scal_tail_loop: // do {
MOVUPS (SRC), X2 // X_i = { imag(x[i]), real(x[i]) }
MOVDDUP_X2_X3 // X_(i+1) = { real(x[i]), real(x[i]) }
SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) }
MULPD ALPHA_C, X2 // X_i = { real(ALPHA) * imag(x[i]), imag(ALPHA) * imag(x[i]) }
MULPD ALPHA, X3 // X_(i+1) = { imag(ALPHA) * real(x[i]), real(ALPHA) * real(x[i]) }
// X_(i+1) = {
// imag(result[i]): imag(ALPHA)*real(x[i]) + real(ALPHA)*imag(x[i]),
// real(result[i]): real(ALPHA)*real(x[i]) - imag(ALPHA)*imag(x[i])
// }
ADDSUBPD_X2_X3
MOVUPS X3, (DST) // x[i] = X_(i+1)
ADDQ INC, SRC // SRC = &(SRC[inc])
DECQ TAIL
JNZ scal_tail_loop // } while --TAIL > 0
scal_end:
RET

180
vendor/gonum.org/v1/gonum/internal/asm/c128/stubs.go generated vendored Normal file
View File

@@ -0,0 +1,180 @@
// Copyright ©2020 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package c128
import (
"math"
"math/cmplx"
)
// Add accumulates s into dst element-wise: dst[i] += s[i] for every index i
// of s. dst is indexed with the indices of s.
func Add(dst, s []complex128) {
	for i := 0; i < len(s); i++ {
		dst[i] = dst[i] + s[i]
	}
}
// AddConst adds the scalar alpha to every element of x in place.
func AddConst(alpha complex128, x []complex128) {
	for i := 0; i < len(x); i++ {
		x[i] = x[i] + alpha
	}
}
// CumSum stores the running sum of s in dst, so that
// dst[k] = s[0] + s[1] + ... + s[k], and returns dst.
// When s is empty dst is returned untouched.
func CumSum(dst, s []complex128) []complex128 {
	n := len(s)
	if n == 0 {
		return dst
	}
	dst[0] = s[0]
	for k := 1; k < n; k++ {
		// Read the previous total back from dst, as the range-based form does.
		dst[k] = dst[k-1] + s[k]
	}
	return dst
}
// CumProd stores the running product of s in dst, so that
// dst[k] = s[0] * s[1] * ... * s[k], and returns dst.
// When s is empty dst is returned untouched.
func CumProd(dst, s []complex128) []complex128 {
	n := len(s)
	if n == 0 {
		return dst
	}
	dst[0] = s[0]
	for k := 1; k < n; k++ {
		// Read the previous product back from dst, as the range-based form does.
		dst[k] = dst[k-1] * s[k]
	}
	return dst
}
// Div divides dst by s element-wise in place: dst[i] /= s[i] for every index
// i of s.
func Div(dst, s []complex128) {
	for i := 0; i < len(s); i++ {
		dst[i] = dst[i] / s[i]
	}
}
// DivTo stores the element-wise quotient s[i] / t[i] in dst[i] for every
// index i of s and returns dst.
func DivTo(dst, s, t []complex128) []complex128 {
	for i := 0; i < len(s); i++ {
		dst[i] = s[i] / t[i]
	}
	return dst
}
// DotUnitary returns the conjugated dot product of x and y: the sum over
// every index i of x of cmplx.Conj(x[i]) * y[i].
func DotUnitary(x, y []complex128) (sum complex128) {
	for i := 0; i < len(x); i++ {
		sum += cmplx.Conj(x[i]) * y[i]
	}
	return sum
}
// L2DistanceUnitary returns the L2-norm of x-y.
//
// The sum of squares is kept relative to the largest magnitude seen so far
// (scale), and zero differences are skipped. A NaN magnitude yields NaN and an
// infinite scale yields +Inf.
func L2DistanceUnitary(x, y []complex128) (norm float64) {
	var (
		scale float64
		ssq   = 1.0
	)
	for i := range x {
		d := x[i] - y[i]
		if d == 0 {
			continue
		}
		mag := cmplx.Abs(d)
		switch {
		case math.IsNaN(mag):
			return math.NaN()
		case scale < mag:
			// New largest magnitude: rescale the running sum to it.
			r := scale / mag
			ssq = 1 + ssq*r*r
			scale = mag
		default:
			r := mag / scale
			ssq += r * r
		}
	}
	if math.IsInf(scale, 1) {
		return math.Inf(1)
	}
	return scale * math.Sqrt(ssq)
}
// L2NormUnitary returns the L2-norm of x.
//
// The sum of squares is kept relative to the largest magnitude seen so far
// (scale), and zero elements are skipped. A NaN magnitude yields NaN and an
// infinite scale yields +Inf.
func L2NormUnitary(x []complex128) (norm float64) {
	var (
		scale float64
		ssq   = 1.0
	)
	for i := range x {
		if x[i] == 0 {
			continue
		}
		mag := cmplx.Abs(x[i])
		switch {
		case math.IsNaN(mag):
			return math.NaN()
		case scale < mag:
			// New largest magnitude: rescale the running sum to it.
			r := scale / mag
			ssq = 1 + ssq*r*r
			scale = mag
		default:
			r := mag / scale
			ssq += r * r
		}
	}
	if math.IsInf(scale, 1) {
		return math.Inf(1)
	}
	return scale * math.Sqrt(ssq)
}
// Sum returns the sum of all elements of x, added in index order.
// An empty x gives 0.
func Sum(x []complex128) complex128 {
	var total complex128
	for i := 0; i < len(x); i++ {
		total += x[i]
	}
	return total
}

View File

@@ -0,0 +1,109 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !noasm && !gccgo && !safe
// +build !noasm,!gccgo,!safe
package c128
// AxpyUnitary is
//
// for i, v := range x {
// y[i] += alpha * v
// }
func AxpyUnitary(alpha complex128, x, y []complex128)
// AxpyUnitaryTo is
//
// for i, v := range x {
// dst[i] = alpha*v + y[i]
// }
func AxpyUnitaryTo(dst []complex128, alpha complex128, x, y []complex128)
// AxpyInc is
//
// for i := 0; i < int(n); i++ {
// y[iy] += alpha * x[ix]
// ix += incX
// iy += incY
// }
func AxpyInc(alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr)
// AxpyIncTo is
//
// for i := 0; i < int(n); i++ {
// dst[idst] = alpha*x[ix] + y[iy]
// ix += incX
// iy += incY
// idst += incDst
// }
func AxpyIncTo(dst []complex128, incDst, idst uintptr, alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr)
// DscalUnitary is
//
// for i, v := range x {
// x[i] = complex(real(v)*alpha, imag(v)*alpha)
// }
func DscalUnitary(alpha float64, x []complex128)
// DscalInc is
//
// var ix uintptr
// for i := 0; i < int(n); i++ {
// x[ix] = complex(real(x[ix])*alpha, imag(x[ix])*alpha)
// ix += inc
// }
func DscalInc(alpha float64, x []complex128, n, inc uintptr)
// ScalInc is
//
// var ix uintptr
// for i := 0; i < int(n); i++ {
// x[ix] *= alpha
// ix += inc
// }
func ScalInc(alpha complex128, x []complex128, n, inc uintptr)
// ScalUnitary is
//
// for i := range x {
// x[i] *= alpha
// }
func ScalUnitary(alpha complex128, x []complex128)
// DotcUnitary is
//
// for i, v := range x {
// sum += y[i] * cmplx.Conj(v)
// }
// return sum
func DotcUnitary(x, y []complex128) (sum complex128)
// DotcInc is
//
// for i := 0; i < int(n); i++ {
// sum += y[iy] * cmplx.Conj(x[ix])
// ix += incX
// iy += incY
// }
// return sum
func DotcInc(x, y []complex128, n, incX, incY, ix, iy uintptr) (sum complex128)
// DotuUnitary is
//
// for i, v := range x {
// sum += y[i] * v
// }
// return sum
func DotuUnitary(x, y []complex128) (sum complex128)
// DotuInc is
//
// for i := 0; i < int(n); i++ {
// sum += y[iy] * x[ix]
// ix += incX
// iy += incY
// }
// return sum
func DotuInc(x, y []complex128, n, incX, incY, ix, iy uintptr) (sum complex128)

View File

@@ -0,0 +1,176 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !amd64 || noasm || gccgo || safe
// +build !amd64 noasm gccgo safe
package c128
import "math/cmplx"
// AxpyUnitary adds alpha*x[i] to y[i] for every index i of x.
func AxpyUnitary(alpha complex128, x, y []complex128) {
	for i := 0; i < len(x); i++ {
		y[i] += alpha * x[i]
	}
}
// AxpyUnitaryTo stores alpha*x[i] + y[i] in dst[i] for every index i of x.
func AxpyUnitaryTo(dst []complex128, alpha complex128, x, y []complex128) {
	for i := 0; i < len(x); i++ {
		dst[i] = alpha*x[i] + y[i]
	}
}
// AxpyInc adds alpha*x[ix] to y[iy] for n strided elements. ix and iy are the
// starting indices; after each element they advance by incX and incY.
func AxpyInc(alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr) {
	for remaining := int(n); remaining > 0; remaining-- {
		y[iy] += alpha * x[ix]
		ix, iy = ix+incX, iy+incY
	}
}
// AxpyIncTo stores alpha*x[ix] + y[iy] in dst[idst] for n strided elements.
// ix, iy and idst are the starting indices; after each element they advance
// by incX, incY and incDst.
func AxpyIncTo(dst []complex128, incDst, idst uintptr, alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr) {
	for remaining := int(n); remaining > 0; remaining-- {
		dst[idst] = alpha*x[ix] + y[iy]
		ix, iy, idst = ix+incX, iy+incY, idst+incDst
	}
}
// DscalUnitary scales every element of x in place by the real factor alpha,
// multiplying the real and imaginary parts separately.
func DscalUnitary(alpha float64, x []complex128) {
	for i := range x {
		re, im := real(x[i]), imag(x[i])
		x[i] = complex(re*alpha, im*alpha)
	}
}
// DscalInc scales n elements of x in place by the real factor alpha, starting
// at x[0] and stepping inc elements each time. Real and imaginary parts are
// multiplied separately.
func DscalInc(alpha float64, x []complex128, n, inc uintptr) {
	ix := uintptr(0)
	for remaining := int(n); remaining > 0; remaining-- {
		re, im := real(x[ix]), imag(x[ix])
		x[ix] = complex(re*alpha, im*alpha)
		ix += inc
	}
}
// ScalInc multiplies n elements of x in place by alpha, starting at x[0] and
// stepping inc elements each time.
func ScalInc(alpha complex128, x []complex128, n, inc uintptr) {
	ix := uintptr(0)
	for remaining := int(n); remaining > 0; remaining-- {
		x[ix] = x[ix] * alpha
		ix += inc
	}
}
// ScalUnitary multiplies every element of x in place by alpha.
func ScalUnitary(alpha complex128, x []complex128) {
	for i := 0; i < len(x); i++ {
		x[i] = x[i] * alpha
	}
}
// DotcUnitary returns the conjugated dot product of x and y: the sum over
// every index i of x of y[i] * cmplx.Conj(x[i]).
func DotcUnitary(x, y []complex128) (sum complex128) {
	for i := 0; i < len(x); i++ {
		sum += y[i] * cmplx.Conj(x[i])
	}
	return sum
}
// DotcInc returns the conjugated dot product of n strided elements: the sum
// of y[iy] * cmplx.Conj(x[ix]). ix and iy are the starting indices; after
// each element they advance by incX and incY.
func DotcInc(x, y []complex128, n, incX, incY, ix, iy uintptr) (sum complex128) {
	for remaining := int(n); remaining > 0; remaining-- {
		sum += y[iy] * cmplx.Conj(x[ix])
		ix, iy = ix+incX, iy+incY
	}
	return sum
}
// DotuUnitary returns the unconjugated dot product of x and y: the sum over
// every index i of x of y[i] * x[i].
func DotuUnitary(x, y []complex128) (sum complex128) {
	for i := 0; i < len(x); i++ {
		sum += y[i] * x[i]
	}
	return sum
}
// DotuInc returns the unconjugated dot product of n strided elements: the sum
// of y[iy] * x[ix]. ix and iy are the starting indices; after each element
// they advance by incX and incY.
func DotuInc(x, y []complex128, n, incX, incY, ix, iy uintptr) (sum complex128) {
	for remaining := int(n); remaining > 0; remaining-- {
		sum += y[iy] * x[ix]
		ix, iy = ix+incX, iy+incY
	}
	return sum
}

View File

@@ -0,0 +1,151 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
// MOVSHDUP X3, X2
#define MOVSHDUP_X3_X2 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xD3
// MOVSLDUP X3, X3
#define MOVSLDUP_X3_X3 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xDB
// ADDSUBPS X2, X3
#define ADDSUBPS_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA
// MOVSHDUP X5, X4
#define MOVSHDUP_X5_X4 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xE5
// MOVSLDUP X5, X5
#define MOVSLDUP_X5_X5 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xED
// ADDSUBPS X4, X5
#define ADDSUBPS_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC
// MOVSHDUP X7, X6
#define MOVSHDUP_X7_X6 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xF7
// MOVSLDUP X7, X7
#define MOVSLDUP_X7_X7 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xFF
// ADDSUBPS X6, X7
#define ADDSUBPS_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE
// MOVSHDUP X9, X8
#define MOVSHDUP_X9_X8 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x16; BYTE $0xC1
// MOVSLDUP X9, X9
#define MOVSLDUP_X9_X9 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC9
// ADDSUBPS X8, X9
#define ADDSUBPS_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8
// func AxpyInc(alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr)
//
// AxpyInc performs y[iy] += alpha * x[ix] for n strided elements, advancing
// ix by incX and iy by incY after each one. In the main loop DX is the read
// pointer and DI the write pointer into y; both advance by the same incY
// stride. The main loop handles four elements per pass; axpyi_tail handles
// the remaining n % 4 elements and uses CX as the LOOP counter.
// Lane comments below are written { high lane, low lane }.
TEXT ·AxpyInc(SB), NOSPLIT, $0
MOVQ x_base+8(FP), SI // SI = &x
MOVQ y_base+32(FP), DI // DI = &y
MOVQ n+56(FP), CX // CX = n
CMPQ CX, $0 // if n==0 { return }
JE axpyi_end
MOVQ ix+80(FP), R8 // R8 = ix
MOVQ iy+88(FP), R9 // R9 = iy
LEAQ (SI)(R8*8), SI // SI = &(x[ix])
LEAQ (DI)(R9*8), DI // DI = &(y[iy])
MOVQ DI, DX // DX = DI // Read/Write pointers
MOVQ incX+64(FP), R8 // R8 = incX
SHLQ $3, R8 // R8 *= sizeof(complex64)
MOVQ incY+72(FP), R9 // R9 = incY
SHLQ $3, R9 // R9 *= sizeof(complex64)
MOVSD alpha+0(FP), X0 // X0 = { 0, 0, imag(a), real(a) }
MOVAPS X0, X1
SHUFPS $0x11, X1, X1 // X1 = { 0, 0, real(a), imag(a) }
MOVAPS X0, X10 // Copy X0 and X1 for pipelining
MOVAPS X1, X11
MOVQ CX, BX
ANDQ $3, CX // CX = n % 4
SHRQ $2, BX // BX = floor( n / 4 )
JZ axpyi_tail // if BX == 0 { goto axpyi_tail }
axpyi_loop: // do {
MOVSD (SI), X3 // X_i = { imag(x[i]), real(x[i]) }
MOVSD (SI)(R8*1), X5
LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2])
MOVSD (SI), X7
MOVSD (SI)(R8*1), X9
// X_(i-1) = { imag(x[i]), imag(x[i]) }
MOVSHDUP_X3_X2
MOVSHDUP_X5_X4
MOVSHDUP_X7_X6
MOVSHDUP_X9_X8
// X_i = { real(x[i]), real(x[i]) }
MOVSLDUP_X3_X3
MOVSLDUP_X5_X5
MOVSLDUP_X7_X7
MOVSLDUP_X9_X9
// X_(i-1) = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
// X_i = { imag(a) * real(x[i]), real(a) * real(x[i]) }
MULPS X1, X2
MULPS X0, X3
MULPS X11, X4
MULPS X10, X5
MULPS X1, X6
MULPS X0, X7
MULPS X11, X8
MULPS X10, X9
// X_i = {
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]),
// }
ADDSUBPS_X2_X3
ADDSUBPS_X4_X5
ADDSUBPS_X6_X7
ADDSUBPS_X8_X9
// X_i = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
MOVSD (DX), X2
MOVSD (DX)(R9*1), X4
LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2])
MOVSD (DX), X6
MOVSD (DX)(R9*1), X8
ADDPS X2, X3
ADDPS X4, X5
ADDPS X6, X7
ADDPS X8, X9
MOVSD X3, (DI) // y[i] = X_i
MOVSD X5, (DI)(R9*1)
LEAQ (DI)(R9*2), DI // DI = &(DI[incY*2])
MOVSD X7, (DI)
MOVSD X9, (DI)(R9*1)
LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2])
LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2])
LEAQ (DI)(R9*2), DI // DI = &(DI[incY*2])
DECQ BX
JNZ axpyi_loop // } while --BX > 0
CMPQ CX, $0 // if CX == 0 { return }
JE axpyi_end
axpyi_tail: // do {
MOVSD (SI), X3 // X_i = { imag(x[i]), real(x[i]) }
MOVSHDUP_X3_X2 // X_(i-1) = { imag(x[i]), imag(x[i]) }
MOVSLDUP_X3_X3 // X_i = { real(x[i]), real(x[i]) }
// X_i = { imag(a) * real(x[i]), real(a) * real(x[i]) }
// X_(i-1) = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
MULPS X1, X2
MULPS X0, X3
// X_i = {
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]),
// }
ADDSUBPS_X2_X3 // (ai*x1r+ar*x1i, ar*x1r-ai*x1i)
// X_i = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
MOVSD (DI), X4
ADDPS X4, X3
MOVSD X3, (DI) // y[i] = X_i
ADDQ R8, SI // SI += incX
ADDQ R9, DI // DI += incY
LOOP axpyi_tail // } while --CX > 0
axpyi_end:
RET

View File

@@ -0,0 +1,156 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
// MOVSHDUP X3, X2
#define MOVSHDUP_X3_X2 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xD3
// MOVSLDUP X3, X3
#define MOVSLDUP_X3_X3 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xDB
// ADDSUBPS X2, X3
#define ADDSUBPS_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA
// MOVSHDUP X5, X4
#define MOVSHDUP_X5_X4 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xE5
// MOVSLDUP X5, X5
#define MOVSLDUP_X5_X5 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xED
// ADDSUBPS X4, X5
#define ADDSUBPS_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC
// MOVSHDUP X7, X6
#define MOVSHDUP_X7_X6 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xF7
// MOVSLDUP X7, X7
#define MOVSLDUP_X7_X7 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xFF
// ADDSUBPS X6, X7
#define ADDSUBPS_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE
// MOVSHDUP X9, X8
#define MOVSHDUP_X9_X8 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x16; BYTE $0xC1
// MOVSLDUP X9, X9
#define MOVSLDUP_X9_X9 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC9
// ADDSUBPS X8, X9
#define ADDSUBPS_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8
// func AxpyIncTo(dst []complex64, incDst, idst uintptr, alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr)
//
// AxpyIncTo performs dst[idst] = alpha*x[ix] + y[iy] for n strided elements,
// advancing ix, iy and idst by incX, incY and incDst after each one.
// SI walks x, DX walks y and DI walks dst. The main loop handles four elements
// per pass; axpyi_tail handles the remaining n % 4 elements and uses CX as the
// LOOP counter.
// Lane comments below are written { high lane, low lane }.
TEXT ·AxpyIncTo(SB), NOSPLIT, $0
MOVQ dst_base+0(FP), DI // DI = &dst
MOVQ x_base+48(FP), SI // SI = &x
MOVQ y_base+72(FP), DX // DX = &y
MOVQ n+96(FP), CX // CX = n
CMPQ CX, $0 // if n==0 { return }
JE axpyi_end
MOVQ ix+120(FP), R8 // Load the first index
MOVQ iy+128(FP), R9
MOVQ idst+32(FP), R10
LEAQ (SI)(R8*8), SI // SI = &(x[ix])
LEAQ (DX)(R9*8), DX // DX = &(y[iy])
LEAQ (DI)(R10*8), DI // DI = &(dst[idst])
MOVQ incX+104(FP), R8 // Incrementors*8 for easy iteration (ADDQ)
SHLQ $3, R8
MOVQ incY+112(FP), R9
SHLQ $3, R9
MOVQ incDst+24(FP), R10
SHLQ $3, R10
MOVSD alpha+40(FP), X0 // X0 = { 0, 0, imag(a), real(a) }
MOVAPS X0, X1
SHUFPS $0x11, X1, X1 // X1 = { 0, 0, real(a), imag(a) }
MOVAPS X0, X10 // Copy X0 and X1 for pipelining
MOVAPS X1, X11
MOVQ CX, BX
ANDQ $3, CX // CX = n % 4
SHRQ $2, BX // BX = floor( n / 4 )
JZ axpyi_tail // if BX == 0 { goto axpyi_tail }
axpyi_loop: // do {
MOVSD (SI), X3 // X_i = { imag(x[i]), real(x[i]) }
MOVSD (SI)(R8*1), X5
LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2])
MOVSD (SI), X7
MOVSD (SI)(R8*1), X9
// X_(i-1) = { imag(x[i]), imag(x[i]) }
MOVSHDUP_X3_X2
MOVSHDUP_X5_X4
MOVSHDUP_X7_X6
MOVSHDUP_X9_X8
// X_i = { real(x[i]), real(x[i]) }
MOVSLDUP_X3_X3
MOVSLDUP_X5_X5
MOVSLDUP_X7_X7
MOVSLDUP_X9_X9
// X_(i-1) = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
// X_i = { imag(a) * real(x[i]), real(a) * real(x[i]) }
MULPS X1, X2
MULPS X0, X3
MULPS X11, X4
MULPS X10, X5
MULPS X1, X6
MULPS X0, X7
MULPS X11, X8
MULPS X10, X9
// X_i = {
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]),
// }
ADDSUBPS_X2_X3
ADDSUBPS_X4_X5
ADDSUBPS_X6_X7
ADDSUBPS_X8_X9
// X_i = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
MOVSD (DX), X2
MOVSD (DX)(R9*1), X4
LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2])
MOVSD (DX), X6
MOVSD (DX)(R9*1), X8
ADDPS X2, X3
ADDPS X4, X5
ADDPS X6, X7
ADDPS X8, X9
MOVSD X3, (DI) // dst[i] = X_i
MOVSD X5, (DI)(R10*1)
LEAQ (DI)(R10*2), DI // DI = &(DI[incDst*2])
MOVSD X7, (DI)
MOVSD X9, (DI)(R10*1)
LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2])
LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2])
LEAQ (DI)(R10*2), DI // DI = &(DI[incDst*2])
DECQ BX
JNZ axpyi_loop // } while --BX > 0
CMPQ CX, $0 // if CX == 0 { return }
JE axpyi_end
axpyi_tail: // do {
MOVSD (SI), X3 // X_i = { imag(x[i]), real(x[i]) }
MOVSHDUP_X3_X2 // X_(i-1) = { imag(x[i]), imag(x[i]) }
MOVSLDUP_X3_X3 // X_i = { real(x[i]), real(x[i]) }
// X_i = { imag(a) * real(x[i]), real(a) * real(x[i]) }
// X_(i-1) = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
MULPS X1, X2
MULPS X0, X3
// X_i = {
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]),
// }
ADDSUBPS_X2_X3
// X_i = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
MOVSD (DX), X4
ADDPS X4, X3
MOVSD X3, (DI) // dst[i] = X_i
ADDQ R8, SI // SI += incX
ADDQ R9, DX // DX += incY
ADDQ R10, DI // DI += incDst
LOOP axpyi_tail // } while --CX > 0
axpyi_end:
RET

View File

@@ -0,0 +1,160 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
// Hand-encoded SSE3 instructions. Each macro emits the raw machine-code bytes
// of the instruction named in the comment above it (operands are written in
// Go assembler order: source, destination).
//   MOVSHDUP = F3 0F 16 /r : copy each odd 32-bit lane into the even lane below it
//   MOVSLDUP = F3 0F 12 /r : copy each even 32-bit lane into the odd lane above it
//   ADDSUBPS = F2 0F D0 /r : dst = dst - src in even lanes, dst + src in odd lanes
// The X8/X9 forms carry an extra REX prefix byte (0x45) to reach registers 8-15.
// MOVSHDUP X3, X2
#define MOVSHDUP_X3_X2 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xD3
// MOVSLDUP X3, X3
#define MOVSLDUP_X3_X3 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xDB
// ADDSUBPS X2, X3
#define ADDSUBPS_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA
// MOVSHDUP X5, X4
#define MOVSHDUP_X5_X4 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xE5
// MOVSLDUP X5, X5
#define MOVSLDUP_X5_X5 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xED
// ADDSUBPS X4, X5
#define ADDSUBPS_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC
// MOVSHDUP X7, X6
#define MOVSHDUP_X7_X6 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xF7
// MOVSLDUP X7, X7
#define MOVSLDUP_X7_X7 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xFF
// ADDSUBPS X6, X7
#define ADDSUBPS_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE
// MOVSHDUP X9, X8
#define MOVSHDUP_X9_X8 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x16; BYTE $0xC1
// MOVSLDUP X9, X9
#define MOVSLDUP_X9_X9 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC9
// ADDSUBPS X8, X9
#define ADDSUBPS_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8
// func AxpyUnitary(alpha complex64, x, y []complex64)
//
// AxpyUnitary updates y in place: y[i] += alpha * x[i] for every
// i < min(len(x), len(y)). It returns immediately when that count is 0.
//
// Register use:
//   SI, DI  = &x[0], &y[0]
//   AX      = element index i
//   CX      = elements remaining (later: tail count, n % 8)
//   BX      = alignment scratch, then the number of 8-element blocks
//   X0, X10 = alpha repeated twice; X1, X11 = alpha with real/imag swapped
TEXT ·AxpyUnitary(SB), NOSPLIT, $0
MOVQ x_base+8(FP), SI // SI = &x
MOVQ y_base+32(FP), DI // DI = &y
MOVQ x_len+16(FP), CX // CX = min( len(x), len(y) )
CMPQ y_len+40(FP), CX
CMOVQLE y_len+40(FP), CX
CMPQ CX, $0 // if CX == 0 { return }
JE caxy_end
PXOR X0, X0 // Clear work registers and cache-align loop
PXOR X1, X1
MOVSD alpha+0(FP), X0 // X0 = { 0, 0, imag(a), real(a) }
SHUFPD $0, X0, X0 // X0 = { imag(a), real(a), imag(a), real(a) }
MOVAPS X0, X1
SHUFPS $0x11, X1, X1 // X1 = { real(a), imag(a), real(a), imag(a) }
XORQ AX, AX // i = 0
MOVQ DI, BX // Align on 16-byte boundary for ADDPS
ANDQ $15, BX // BX = &y & 15
JZ caxy_no_trim // if BX == 0 { goto caxy_no_trim }
// NOTE(review): the ADDPS instructions in caxy_loop take y as a memory operand,
// which needs a 16-byte aligned address. Processing one 8-byte element here
// only corrects an 8-byte offset; presumably y is always at least 8-byte
// aligned — confirm against callers.
// Trim first value in unaligned buffer
XORPS X2, X2 // Clear work registers and cache-align loop
XORPS X3, X3
XORPS X4, X4
MOVSD (SI)(AX*8), X3 // X3 = { imag(x[i]), real(x[i]) }
MOVSHDUP_X3_X2 // X2 = { imag(x[i]), imag(x[i]) }
MOVSLDUP_X3_X3 // X3 = { real(x[i]), real(x[i]) }
MULPS X1, X2 // X2 = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
MULPS X0, X3 // X3 = { imag(a) * real(x[i]), real(a) * real(x[i]) }
// X3 = { imag(a)*real(x[i]) + real(a)*imag(x[i]), real(a)*real(x[i]) - imag(a)*imag(x[i]) }
ADDSUBPS_X2_X3
MOVSD (DI)(AX*8), X4 // X3 += y[i]
ADDPS X4, X3
MOVSD X3, (DI)(AX*8) // y[i] = X3
INCQ AX // i++
DECQ CX // --CX
JZ caxy_end // if CX == 0 { return }
caxy_no_trim:
MOVAPS X0, X10 // Copy X0 and X1 for pipelining
MOVAPS X1, X11
MOVQ CX, BX
ANDQ $7, CX // CX = n % 8
SHRQ $3, BX // BX = floor( n / 8 )
JZ caxy_tail // if BX == 0 { goto caxy_tail }
caxy_loop: // do {
// X_i = { imag(x[i]), real(x[i]), imag(x[i+1]), real(x[i+1]) }
MOVUPS (SI)(AX*8), X3
MOVUPS 16(SI)(AX*8), X5
MOVUPS 32(SI)(AX*8), X7
MOVUPS 48(SI)(AX*8), X9
// X_(i-1) = { imag(x[i]), imag(x[i]), imag(x[i+1]), imag(x[i+1]) }
MOVSHDUP_X3_X2
MOVSHDUP_X5_X4
MOVSHDUP_X7_X6
MOVSHDUP_X9_X8
// X_i = { real(x[i]), real(x[i]), real(x[i+1]), real(x[i+1]) }
MOVSLDUP_X3_X3
MOVSLDUP_X5_X5
MOVSLDUP_X7_X7
MOVSLDUP_X9_X9
// X_i = { imag(a) * real(x[i]), real(a) * real(x[i]),
// imag(a) * real(x[i+1]), real(a) * real(x[i+1]) }
// X_(i-1) = { real(a) * imag(x[i]), imag(a) * imag(x[i]),
// real(a) * imag(x[i+1]), imag(a) * imag(x[i+1]) }
MULPS X1, X2
MULPS X0, X3
MULPS X11, X4
MULPS X10, X5
MULPS X1, X6
MULPS X0, X7
MULPS X11, X8
MULPS X10, X9
// X_i = {
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]),
// imag(result[i+1]): imag(a)*real(x[i+1]) + real(a)*imag(x[i+1]),
// real(result[i+1]): real(a)*real(x[i+1]) - imag(a)*imag(x[i+1]),
// }
ADDSUBPS_X2_X3
ADDSUBPS_X4_X5
ADDSUBPS_X6_X7
ADDSUBPS_X8_X9
// X_i = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]),
// imag(result[i+1]) + imag(y[i+1]), real(result[i+1]) + real(y[i+1]) }
ADDPS (DI)(AX*8), X3
ADDPS 16(DI)(AX*8), X5
ADDPS 32(DI)(AX*8), X7
ADDPS 48(DI)(AX*8), X9
MOVUPS X3, (DI)(AX*8) // y[i:i+1] = X_i
MOVUPS X5, 16(DI)(AX*8)
MOVUPS X7, 32(DI)(AX*8)
MOVUPS X9, 48(DI)(AX*8)
ADDQ $8, AX // i += 8
DECQ BX // --BX
JNZ caxy_loop // } while BX > 0
CMPQ CX, $0 // if CX == 0 { return }
JE caxy_end
caxy_tail: // do {
MOVSD (SI)(AX*8), X3 // X3 = { imag(x[i]), real(x[i]) }
MOVSHDUP_X3_X2 // X2 = { imag(x[i]), imag(x[i]) }
MOVSLDUP_X3_X3 // X3 = { real(x[i]), real(x[i]) }
MULPS X1, X2 // X2 = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
MULPS X0, X3 // X3 = { imag(a) * real(x[i]), real(a) * real(x[i]) }
// X3 = { imag(a)*real(x[i]) + real(a)*imag(x[i]),
// real(a)*real(x[i]) - imag(a)*imag(x[i]) }
ADDSUBPS_X2_X3
MOVSD (DI)(AX*8), X4 // X3 += y[i]
ADDPS X4, X3
MOVSD X3, (DI)(AX*8) // y[i] = X3
INCQ AX // ++i
LOOP caxy_tail // } while --CX > 0 (LOOP decrements CX itself; CX is never 0 on entry to the tail)
caxy_end:
RET

View File

@@ -0,0 +1,157 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
// Hand-encoded SSE3 instructions. Each macro emits the raw machine-code bytes
// of the instruction named in the comment above it (operands are written in
// Go assembler order: source, destination).
//   MOVSHDUP = F3 0F 16 /r : copy each odd 32-bit lane into the even lane below it
//   MOVSLDUP = F3 0F 12 /r : copy each even 32-bit lane into the odd lane above it
//   ADDSUBPS = F2 0F D0 /r : dst = dst - src in even lanes, dst + src in odd lanes
// The X8/X9 forms carry an extra REX prefix byte (0x45) to reach registers 8-15.
// MOVSHDUP X3, X2
#define MOVSHDUP_X3_X2 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xD3
// MOVSLDUP X3, X3
#define MOVSLDUP_X3_X3 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xDB
// ADDSUBPS X2, X3
#define ADDSUBPS_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA
// MOVSHDUP X5, X4
#define MOVSHDUP_X5_X4 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xE5
// MOVSLDUP X5, X5
#define MOVSLDUP_X5_X5 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xED
// ADDSUBPS X4, X5
#define ADDSUBPS_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC
// MOVSHDUP X7, X6
#define MOVSHDUP_X7_X6 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xF7
// MOVSLDUP X7, X7
#define MOVSLDUP_X7_X7 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xFF
// ADDSUBPS X6, X7
#define ADDSUBPS_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE
// MOVSHDUP X9, X8
#define MOVSHDUP_X9_X8 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x16; BYTE $0xC1
// MOVSLDUP X9, X9
#define MOVSLDUP_X9_X9 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC9
// ADDSUBPS X8, X9
#define ADDSUBPS_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8
// func AxpyUnitaryTo(dst []complex64, alpha complex64, x, y []complex64)
//
// AxpyUnitaryTo stores alpha*x[i] + y[i] into dst[i] for every
// i < min(len(x), len(y), len(dst)). It returns immediately when that
// count is 0.
//
// Register use:
//   DI, SI, DX = &dst[0], &x[0], &y[0]
//   AX         = element index i
//   CX         = elements remaining (later: tail count, n % 8)
//   BX         = alignment scratch, then the number of 8-element blocks
//   X0, X10    = alpha repeated twice; X1, X11 = alpha with real/imag swapped
TEXT ·AxpyUnitaryTo(SB), NOSPLIT, $0
MOVQ dst_base+0(FP), DI // DI = &dst
MOVQ x_base+32(FP), SI // SI = &x
MOVQ y_base+56(FP), DX // DX = &y
MOVQ x_len+40(FP), CX
CMPQ y_len+64(FP), CX // CX = min( len(x), len(y), len(dst) )
CMOVQLE y_len+64(FP), CX
CMPQ dst_len+8(FP), CX
CMOVQLE dst_len+8(FP), CX
CMPQ CX, $0 // if CX == 0 { return }
JE caxy_end
MOVSD alpha+24(FP), X0 // X0 = { 0, 0, imag(a), real(a) }
SHUFPD $0, X0, X0 // X0 = { imag(a), real(a), imag(a), real(a) }
MOVAPS X0, X1
SHUFPS $0x11, X1, X1 // X1 = { real(a), imag(a), real(a), imag(a) }
XORQ AX, AX // i = 0
MOVQ DX, BX // Align on 16-byte boundary for ADDPS
ANDQ $15, BX // BX = &y & 15
JZ caxy_no_trim // if BX == 0 { goto caxy_no_trim }
// Process one element so that y[i] is 16-byte aligned for the ADDPS memory
// operands in caxy_loop.
MOVSD (SI)(AX*8), X3 // X3 = { imag(x[i]), real(x[i]) }
MOVSHDUP_X3_X2 // X2 = { imag(x[i]), imag(x[i]) }
MOVSLDUP_X3_X3 // X3 = { real(x[i]), real(x[i]) }
MULPS X1, X2 // X2 = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
MULPS X0, X3 // X3 = { imag(a) * real(x[i]), real(a) * real(x[i]) }
// X3 = { imag(a)*real(x[i]) + real(a)*imag(x[i]), real(a)*real(x[i]) - imag(a)*imag(x[i]) }
ADDSUBPS_X2_X3
MOVSD (DX)(AX*8), X4 // X3 += y[i]
ADDPS X4, X3
MOVSD X3, (DI)(AX*8) // dst[i] = X3
INCQ AX // i++
DECQ CX // --CX
// Nothing is left when the trimmed element was the only one. The tail below
// is a do-while driven by LOOP, so entering it with CX == 0 would process an
// out-of-range element and wrap CX around; return instead.
JZ caxy_end // if CX == 0 { return }
caxy_no_trim:
MOVAPS X0, X10 // Copy X0 and X1 for pipelining
MOVAPS X1, X11
MOVQ CX, BX
ANDQ $7, CX // CX = n % 8
SHRQ $3, BX // BX = floor( n / 8 )
JZ caxy_tail // if BX == 0 { goto caxy_tail }
caxy_loop: // do {
// X_i = { imag(x[i]), real(x[i]), imag(x[i+1]), real(x[i+1]) }
MOVUPS (SI)(AX*8), X3
MOVUPS 16(SI)(AX*8), X5
MOVUPS 32(SI)(AX*8), X7
MOVUPS 48(SI)(AX*8), X9
// X_(i-1) = { imag(x[i]), imag(x[i]), imag(x[i+1]), imag(x[i+1]) }
MOVSHDUP_X3_X2
MOVSHDUP_X5_X4
MOVSHDUP_X7_X6
MOVSHDUP_X9_X8
// X_i = { real(x[i]), real(x[i]), real(x[i+1]), real(x[i+1]) }
MOVSLDUP_X3_X3
MOVSLDUP_X5_X5
MOVSLDUP_X7_X7
MOVSLDUP_X9_X9
// X_i = { imag(a) * real(x[i]), real(a) * real(x[i]),
// imag(a) * real(x[i+1]), real(a) * real(x[i+1]) }
// X_(i-1) = { real(a) * imag(x[i]), imag(a) * imag(x[i]),
// real(a) * imag(x[i+1]), imag(a) * imag(x[i+1]) }
MULPS X1, X2
MULPS X0, X3
MULPS X11, X4
MULPS X10, X5
MULPS X1, X6
MULPS X0, X7
MULPS X11, X8
MULPS X10, X9
// X_i = {
// imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]),
// real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]),
// imag(result[i+1]): imag(a)*real(x[i+1]) + real(a)*imag(x[i+1]),
// real(result[i+1]): real(a)*real(x[i+1]) - imag(a)*imag(x[i+1]),
// }
ADDSUBPS_X2_X3
ADDSUBPS_X4_X5
ADDSUBPS_X6_X7
ADDSUBPS_X8_X9
// X_i = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]),
// imag(result[i+1]) + imag(y[i+1]), real(result[i+1]) + real(y[i+1]) }
ADDPS (DX)(AX*8), X3
ADDPS 16(DX)(AX*8), X5
ADDPS 32(DX)(AX*8), X7
ADDPS 48(DX)(AX*8), X9
MOVUPS X3, (DI)(AX*8) // dst[i:i+1] = X_i
MOVUPS X5, 16(DI)(AX*8)
MOVUPS X7, 32(DI)(AX*8)
MOVUPS X9, 48(DI)(AX*8)
ADDQ $8, AX // i += 8
DECQ BX // --BX
JNZ caxy_loop // } while BX > 0
CMPQ CX, $0 // if CX == 0 { return }
JE caxy_end
caxy_tail: // do {
MOVSD (SI)(AX*8), X3 // X3 = { imag(x[i]), real(x[i]) }
MOVSHDUP_X3_X2 // X2 = { imag(x[i]), imag(x[i]) }
MOVSLDUP_X3_X3 // X3 = { real(x[i]), real(x[i]) }
MULPS X1, X2 // X2 = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
MULPS X0, X3 // X3 = { imag(a) * real(x[i]), real(a) * real(x[i]) }
// X3 = { imag(a)*real(x[i]) + real(a)*imag(x[i]),
// real(a)*real(x[i]) - imag(a)*imag(x[i]) }
ADDSUBPS_X2_X3
MOVSD (DX)(AX*8), X4 // X3 += y[i]
ADDPS X4, X3
MOVSD X3, (DI)(AX*8) // dst[i] = X3
INCQ AX // ++i
LOOP caxy_tail // } while --CX > 0 (LOOP decrements CX itself; CX must be non-zero on entry)
caxy_end:
RET

7
vendor/gonum.org/v1/gonum/internal/asm/c64/conj.go generated vendored Normal file
View File

@@ -0,0 +1,7 @@
// Copyright ©2015 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package c64
// conj returns the complex conjugate of c: the real part is kept and the
// sign of the imaginary part is flipped.
func conj(c complex64) complex64 {
	re, im := real(c), imag(c)
	return complex(re, -im)
}

6
vendor/gonum.org/v1/gonum/internal/asm/c64/doc.go generated vendored Normal file
View File

@@ -0,0 +1,6 @@
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package c64 provides complex64 vector primitives.
package c64 // import "gonum.org/v1/gonum/internal/asm/c64"

View File

@@ -0,0 +1,160 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
// Hand-encoded SSE3 instructions. LONG emits its 32-bit value little-endian,
// so e.g. $0xD3160FF3 produces the bytes F3 0F 16 D3. The trailing comment on
// each macro names the instruction in Go assembler order (source, destination).
#define MOVSHDUP_X3_X2 LONG $0xD3160FF3 // MOVSHDUP X3, X2
#define MOVSHDUP_X5_X4 LONG $0xE5160FF3 // MOVSHDUP X5, X4
#define MOVSHDUP_X7_X6 LONG $0xF7160FF3 // MOVSHDUP X7, X6
#define MOVSHDUP_X9_X8 LONG $0x160F45F3; BYTE $0xC1 // MOVSHDUP X9, X8
#define MOVSLDUP_X3_X3 LONG $0xDB120FF3 // MOVSLDUP X3, X3
#define MOVSLDUP_X5_X5 LONG $0xED120FF3 // MOVSLDUP X5, X5
#define MOVSLDUP_X7_X7 LONG $0xFF120FF3 // MOVSLDUP X7, X7
#define MOVSLDUP_X9_X9 LONG $0x120F45F3; BYTE $0xC9 // MOVSLDUP X9, X9
#define ADDSUBPS_X2_X3 LONG $0xDAD00FF2 // ADDSUBPS X2, X3
#define ADDSUBPS_X4_X5 LONG $0xECD00FF2 // ADDSUBPS X4, X5
#define ADDSUBPS_X6_X7 LONG $0xFED00FF2 // ADDSUBPS X6, X7
#define ADDSUBPS_X8_X9 LONG $0xD00F45F2; BYTE $0xC8 // ADDSUBPS X8, X9
// Register aliases used by DotcInc.
// X_PTR / Y_PTR: address of the current element of x / y.
#define X_PTR SI
#define Y_PTR DI
// LEN: element count, later the number of 4-element blocks; TAIL: n % 4.
#define LEN CX
#define TAIL BX
// SUM / P_SUM: the two accumulators the unrolled loop alternates between.
#define SUM X0
#define P_SUM X1
// INC_*: stride in bytes; INCx3_*: three times that stride.
#define INC_X R8
#define INCx3_X R9
#define INC_Y R10
#define INCx3_Y R11
// NEG1 / P_NEG1: vectors of -1.0 used to negate imag(x) for the conjugate.
#define NEG1 X15
#define P_NEG1 X14
// func DotcInc(x, y []complex64, n, incX, incY, ix, iy uintptr) (sum complex64)
//
// DotcInc returns the sum over i = 0 .. n-1 of
//   conj(x[ix + i*incX]) * y[iy + i*incY]
// and returns 0 when n == 0. Only the base pointers of x and y are read; the
// slice lengths are never consulted, so no bounds checking happens here.
//
// Every element is loaded with MOVSD, so only the low two lanes of each
// vector register carry data, and only the low 64 bits of SUM are returned.
TEXT ·DotcInc(SB), NOSPLIT, $0
MOVQ x_base+0(FP), X_PTR // X_PTR = &x
MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y
PXOR SUM, SUM // SUM = 0
PXOR P_SUM, P_SUM // P_SUM = 0
MOVQ n+48(FP), LEN // LEN = n
CMPQ LEN, $0 // if LEN == 0 { return }
JE dotc_end
MOVQ ix+72(FP), INC_X
MOVQ iy+80(FP), INC_Y
LEAQ (X_PTR)(INC_X*8), X_PTR // X_PTR = &(X_PTR[ix])
LEAQ (Y_PTR)(INC_Y*8), Y_PTR // Y_PTR = &(Y_PTR[iy])
MOVQ incX+56(FP), INC_X // INC_X = incX * sizeof(complex64)
SHLQ $3, INC_X
MOVQ incY+64(FP), INC_Y // INC_Y = incY * sizeof(complex64)
SHLQ $3, INC_Y
MOVSS $(-1.0), NEG1
SHUFPS $0, NEG1, NEG1 // { -1, -1, -1, -1 }
MOVQ LEN, TAIL
ANDQ $3, TAIL // TAIL = LEN % 4
SHRQ $2, LEN // LEN = floor( LEN / 4 )
JZ dotc_tail // if LEN == 0 { goto dotc_tail }
MOVUPS NEG1, P_NEG1 // Copy NEG1 for pipelining
LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3
LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = INC_Y * 3
dotc_loop: // do {
MOVSD (X_PTR), X3 // X_i = { imag(x[i]), real(x[i]) }
MOVSD (X_PTR)(INC_X*1), X5
MOVSD (X_PTR)(INC_X*2), X7
MOVSD (X_PTR)(INCx3_X*1), X9
// X_(i-1) = { imag(x[i]), imag(x[i]) }
MOVSHDUP_X3_X2
MOVSHDUP_X5_X4
MOVSHDUP_X7_X6
MOVSHDUP_X9_X8
// X_i = { real(x[i]), real(x[i]) }
MOVSLDUP_X3_X3
MOVSLDUP_X5_X5
MOVSLDUP_X7_X7
MOVSLDUP_X9_X9
// X_(i-1) = { -imag(x[i]), -imag(x[i]) }
MULPS NEG1, X2
MULPS P_NEG1, X4
MULPS NEG1, X6
MULPS P_NEG1, X8
// X_j = { imag(y[i]), real(y[i]) }
MOVSD (Y_PTR), X10
MOVSD (Y_PTR)(INC_Y*1), X11
MOVSD (Y_PTR)(INC_Y*2), X12
MOVSD (Y_PTR)(INCx3_Y*1), X13
// X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
MULPS X10, X3
MULPS X11, X5
MULPS X12, X7
MULPS X13, X9
// X_j = { real(y[i]), imag(y[i]) }
SHUFPS $0xB1, X10, X10
SHUFPS $0xB1, X11, X11
SHUFPS $0xB1, X12, X12
SHUFPS $0xB1, X13, X13
// X_(i-1) = { -real(y[i]) * imag(x[i]), -imag(y[i]) * imag(x[i]) }
MULPS X10, X2
MULPS X11, X4
MULPS X12, X6
MULPS X13, X8
// X_i = {
// imag(result[i]): imag(y[i]) * real(x[i]) - real(y[i]) * imag(x[i]),
// real(result[i]): real(y[i]) * real(x[i]) + imag(y[i]) * imag(x[i]) }
ADDSUBPS_X2_X3
ADDSUBPS_X4_X5
ADDSUBPS_X6_X7
ADDSUBPS_X8_X9
// SUM += X_i
ADDPS X3, SUM
ADDPS X5, P_SUM
ADDPS X7, SUM
ADDPS X9, P_SUM
LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[INC_X*4])
LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[INC_Y*4])
DECQ LEN
JNZ dotc_loop // } while --LEN > 0
ADDPS P_SUM, SUM // SUM = { P_SUM + SUM }
CMPQ TAIL, $0 // if TAIL == 0 { return }
JE dotc_end
dotc_tail: // do {
MOVSD (X_PTR), X3 // X_i = { imag(x[i]), real(x[i]) }
MOVSHDUP_X3_X2 // X_(i-1) = { imag(x[i]), imag(x[i]) }
MOVSLDUP_X3_X3 // X_i = { real(x[i]), real(x[i]) }
MULPS NEG1, X2 // X_(i-1) = { -imag(x[i]), -imag(x[i]) }
// Load exactly one complex64 (8 bytes). A 16-byte load here would read past
// y[i], and past the end of the slice on the last element.
MOVSD (Y_PTR), X10 // X_j = { imag(y[i]), real(y[i]) }
MULPS X10, X3 // X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
SHUFPS $0x1, X10, X10 // X_j = { real(y[i]), imag(y[i]) }
MULPS X10, X2 // X_(i-1) = { -real(y[i]) * imag(x[i]), -imag(y[i]) * imag(x[i]) }
// X_i = {
// imag(result[i]): imag(y[i])*real(x[i]) - real(y[i])*imag(x[i]),
// real(result[i]): real(y[i])*real(x[i]) + imag(y[i])*imag(x[i]) }
ADDSUBPS_X2_X3
ADDPS X3, SUM // SUM += X_i
ADDQ INC_X, X_PTR // X_PTR += INC_X
ADDQ INC_Y, Y_PTR // Y_PTR += INC_Y
DECQ TAIL
JNZ dotc_tail // } while --TAIL > 0
dotc_end:
MOVSD SUM, sum+88(FP) // return SUM (low 64 bits only)
RET

View File

@@ -0,0 +1,208 @@
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
// Hand-encoded SSE3 instructions. LONG and WORD emit their values
// little-endian, so e.g. LONG $0x1C120FF3; BYTE $0xC6 produces the bytes
// F3 0F 12 1C C6. The trailing comment on each macro names the instruction in
// Go assembler order (source, destination).
// The *_XPTR_IDX_8__* forms hard-code the memory operand disp(SI)(AX*8), so
// they are only valid while X_PTR is SI and IDX is AX.
#define MOVSLDUP_XPTR_IDX_8__X3 LONG $0x1C120FF3; BYTE $0xC6 // MOVSLDUP (SI)(AX*8), X3
#define MOVSLDUP_16_XPTR_IDX_8__X5 LONG $0x6C120FF3; WORD $0x10C6 // MOVSLDUP 16(SI)(AX*8), X5
#define MOVSLDUP_32_XPTR_IDX_8__X7 LONG $0x7C120FF3; WORD $0x20C6 // MOVSLDUP 32(SI)(AX*8), X7
#define MOVSLDUP_48_XPTR_IDX_8__X9 LONG $0x120F44F3; WORD $0xC64C; BYTE $0x30 // MOVSLDUP 48(SI)(AX*8), X9
#define MOVSHDUP_XPTR_IDX_8__X2 LONG $0x14160FF3; BYTE $0xC6 // MOVSHDUP (SI)(AX*8), X2
#define MOVSHDUP_16_XPTR_IDX_8__X4 LONG $0x64160FF3; WORD $0x10C6 // MOVSHDUP 16(SI)(AX*8), X4
#define MOVSHDUP_32_XPTR_IDX_8__X6 LONG $0x74160FF3; WORD $0x20C6 // MOVSHDUP 32(SI)(AX*8), X6
#define MOVSHDUP_48_XPTR_IDX_8__X8 LONG $0x160F44F3; WORD $0xC644; BYTE $0x30 // MOVSHDUP 48(SI)(AX*8), X8
#define MOVSHDUP_X3_X2 LONG $0xD3160FF3 // MOVSHDUP X3, X2
#define MOVSLDUP_X3_X3 LONG $0xDB120FF3 // MOVSLDUP X3, X3
#define ADDSUBPS_X2_X3 LONG $0xDAD00FF2 // ADDSUBPS X2, X3
#define ADDSUBPS_X4_X5 LONG $0xECD00FF2 // ADDSUBPS X4, X5
#define ADDSUBPS_X6_X7 LONG $0xFED00FF2 // ADDSUBPS X6, X7
#define ADDSUBPS_X8_X9 LONG $0xD00F45F2; BYTE $0xC8 // ADDSUBPS X8, X9
// Register aliases used by DotcUnitary.
// X_PTR / Y_PTR: base address of x / y.
#define X_PTR SI
#define Y_PTR DI
// LEN: element count, later a block count; TAIL: elements left after the 8-wide loop.
#define LEN CX
#define TAIL BX
// SUM / P_SUM: the two accumulators the unrolled loop alternates between.
#define SUM X0
#define P_SUM X1
// IDX: current element index. I_IDX is defined but DotcUnitary names DX directly.
#define IDX AX
#define I_IDX DX
// NEG1 / P_NEG1: vectors of -1.0 used to negate imag(x) for the conjugate.
#define NEG1 X15
#define P_NEG1 X14
// func DotcUnitary(x, y []complex64) (sum complex64)
//
// DotcUnitary returns the sum of conj(x[i]) * y[i] over every
// i < min(len(x), len(y)), and 0 when that count is 0.
//
// Structure: an optional single-element trim when &x is not 16-byte aligned,
// a loop over blocks of 8 elements, a loop over pairs, then at most one final
// element. SUM and P_SUM hold two complex partial sums each (four float32
// lanes); dotc_end folds them into the low 64 bits that are returned.
TEXT ·DotcUnitary(SB), NOSPLIT, $0
MOVQ x_base+0(FP), X_PTR // X_PTR = &x
MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y
PXOR SUM, SUM // SUM = 0
PXOR P_SUM, P_SUM // P_SUM = 0
MOVQ x_len+8(FP), LEN // LEN = min( len(x), len(y) )
CMPQ y_len+32(FP), LEN
CMOVQLE y_len+32(FP), LEN
CMPQ LEN, $0 // if LEN == 0 { return }
JE dotc_end
XORQ IDX, IDX // i = 0
MOVSS $(-1.0), NEG1
SHUFPS $0, NEG1, NEG1 // { -1, -1, -1, -1 }
MOVQ X_PTR, DX
ANDQ $15, DX // DX = &x & 15
JZ dotc_aligned // if DX == 0 { goto dotc_aligned }
// Process one element so that the memory-operand MOVSLDUP/MOVSHDUP macros in
// the loops below read x from a 16-byte aligned address.
// NOTE(review): this only corrects an 8-byte offset; presumably x is always
// at least 8-byte aligned — confirm against callers.
MOVSD (X_PTR)(IDX*8), X3 // X_i = { imag(x[i]), real(x[i]) }
MOVSHDUP_X3_X2 // X_(i-1) = { imag(x[i]), imag(x[i]) }
MOVSLDUP_X3_X3 // X_i = { real(x[i]), real(x[i]) }
MOVSD (Y_PTR)(IDX*8), X10 // X_j = { imag(y[i]), real(y[i]) }
MULPS NEG1, X2 // X_(i-1) = { -imag(x[i]), -imag(x[i]) }
MULPS X10, X3 // X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
SHUFPS $0x1, X10, X10 // X_j = { real(y[i]), imag(y[i]) }
MULPS X10, X2 // X_(i-1) = { -real(y[i]) * imag(x[i]), -imag(y[i]) * imag(x[i]) }
// X_i = {
// imag(result[i]): imag(y[i])*real(x[i]) - real(y[i])*imag(x[i]),
// real(result[i]): real(y[i])*real(x[i]) + imag(y[i])*imag(x[i]) }
ADDSUBPS_X2_X3
// NOTE(review): SHUFPS $0x1 leaves copies of real(y[i]) in the two upper lanes
// of X10, so the upper lanes of X3 are +/-0 for finite input but become NaN
// when real(y[i]) is Inf or NaN; on the loop and pair paths those lanes are
// later folded into the result through P_SUM. Presumably only finite data
// matters here — confirm whether $0xB1 was intended.
MOVAPS X3, SUM // SUM = X_i
INCQ IDX // IDX++
DECQ LEN // LEN--
JZ dotc_ret // if LEN == 0 { goto dotc_ret }
dotc_aligned:
MOVQ LEN, TAIL
ANDQ $7, TAIL // TAIL = LEN % 8
SHRQ $3, LEN // LEN = floor( LEN / 8 )
JZ dotc_tail // if LEN == 0 { goto dotc_tail }
MOVUPS NEG1, P_NEG1 // Copy NEG1 for pipelining
dotc_loop: // do {
MOVSLDUP_XPTR_IDX_8__X3 // X_i = { real(x[i]), real(x[i]), real(x[i+1]), real(x[i+1]) }
MOVSLDUP_16_XPTR_IDX_8__X5
MOVSLDUP_32_XPTR_IDX_8__X7
MOVSLDUP_48_XPTR_IDX_8__X9
MOVSHDUP_XPTR_IDX_8__X2 // X_(i-1) = { imag(x[i]), imag(x[i]), imag(x[i+1]), imag(x[i+1]) }
MOVSHDUP_16_XPTR_IDX_8__X4
MOVSHDUP_32_XPTR_IDX_8__X6
MOVSHDUP_48_XPTR_IDX_8__X8
// X_j = { imag(y[i]), real(y[i]), imag(y[i+1]), real(y[i+1]) }
MOVUPS (Y_PTR)(IDX*8), X10
MOVUPS 16(Y_PTR)(IDX*8), X11
MOVUPS 32(Y_PTR)(IDX*8), X12
MOVUPS 48(Y_PTR)(IDX*8), X13
// X_(i-1) = { -imag(x[i]), -imag(x[i]), -imag(x[i+1]), -imag(x[i+1]) }
MULPS NEG1, X2
MULPS P_NEG1, X4
MULPS NEG1, X6
MULPS P_NEG1, X8
// X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]),
// imag(y[i+1]) * real(x[i+1]), real(y[i+1]) * real(x[i+1]) }
MULPS X10, X3
MULPS X11, X5
MULPS X12, X7
MULPS X13, X9
// X_j = { real(y[i]), imag(y[i]), real(y[i+1]), imag(y[i+1]) }
SHUFPS $0xB1, X10, X10
SHUFPS $0xB1, X11, X11
SHUFPS $0xB1, X12, X12
SHUFPS $0xB1, X13, X13
// X_(i-1) = { -real(y[i]) * imag(x[i]), -imag(y[i]) * imag(x[i]),
// -real(y[i+1]) * imag(x[i+1]), -imag(y[i+1]) * imag(x[i+1]) }
MULPS X10, X2
MULPS X11, X4
MULPS X12, X6
MULPS X13, X8
// X_i = {
// imag(result[i]): imag(y[i]) * real(x[i]) - real(y[i]) * imag(x[i]),
// real(result[i]): real(y[i]) * real(x[i]) + imag(y[i]) * imag(x[i]),
// imag(result[i+1]): imag(y[i+1]) * real(x[i+1]) - real(y[i+1]) * imag(x[i+1]),
// real(result[i+1]): real(y[i+1]) * real(x[i+1]) + imag(y[i+1]) * imag(x[i+1]),
// }
ADDSUBPS_X2_X3
ADDSUBPS_X4_X5
ADDSUBPS_X6_X7
ADDSUBPS_X8_X9
// SUM += X_i
ADDPS X3, SUM
ADDPS X5, P_SUM
ADDPS X7, SUM
ADDPS X9, P_SUM
ADDQ $8, IDX // IDX += 8
DECQ LEN
JNZ dotc_loop // } while --LEN > 0
ADDPS SUM, P_SUM // P_SUM = { P_SUM[1] + SUM[1], P_SUM[0] + SUM[0] }
XORPS SUM, SUM // SUM = 0
CMPQ TAIL, $0 // if TAIL == 0 { return }
JE dotc_end
dotc_tail:
MOVQ TAIL, LEN
SHRQ $1, LEN // LEN = floor( TAIL / 2 )
JZ dotc_tail_one // if LEN == 0 { goto dotc_tail_one }
dotc_tail_two: // do {
MOVSLDUP_XPTR_IDX_8__X3 // X_i = { real(x[i]), real(x[i]), real(x[i+1]), real(x[i+1]) }
MOVSHDUP_XPTR_IDX_8__X2 // X_(i-1) = { imag(x[i]), imag(x[i]), imag(x[i+1]), imag(x[i+1]) }
MOVUPS (Y_PTR)(IDX*8), X10 // X_j = { imag(y[i]), real(y[i]), imag(y[i+1]), real(y[i+1]) }
MULPS NEG1, X2 // X_(i-1) = { -imag(x[i]), -imag(x[i]), ... }
MULPS X10, X3 // X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]), ... }
SHUFPS $0xB1, X10, X10 // X_j = { real(y[i]), imag(y[i]), ... }
MULPS X10, X2 // X_(i-1) = { -real(y[i]) * imag(x[i]), -imag(y[i]) * imag(x[i]), ... }
// X_i = {
// imag(result[i]): imag(y[i])*real(x[i]) - real(y[i])*imag(x[i]),
// real(result[i]): real(y[i])*real(x[i]) + imag(y[i])*imag(x[i]) }
ADDSUBPS_X2_X3
ADDPS X3, SUM // SUM += X_i
ADDQ $2, IDX // IDX += 2
DECQ LEN
JNZ dotc_tail_two // } while --LEN > 0
ADDPS SUM, P_SUM // P_SUM = { P_SUM[1] + SUM[1], P_SUM[0] + SUM[0] }
XORPS SUM, SUM // SUM = 0
ANDQ $1, TAIL
JZ dotc_end
dotc_tail_one:
MOVSD (X_PTR)(IDX*8), X3 // X_i = { imag(x[i]), real(x[i]) }
MOVSHDUP_X3_X2 // X_(i-1) = { imag(x[i]), imag(x[i]) }
MOVSLDUP_X3_X3 // X_i = { real(x[i]), real(x[i]) }
MOVSD (Y_PTR)(IDX*8), X10 // X_j = { imag(y[i]), real(y[i]) }
MULPS NEG1, X2 // X_(i-1) = { -imag(x[i]), -imag(x[i]) }
MULPS X10, X3 // X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
SHUFPS $0x1, X10, X10 // X_j = { real(y[i]), imag(y[i]) }
MULPS X10, X2 // X_(i-1) = { -real(y[i]) * imag(x[i]), -imag(y[i]) * imag(x[i]) }
// X_i = {
// imag(result[i]): imag(y[i])*real(x[i]) - real(y[i])*imag(x[i]),
// real(result[i]): real(y[i])*real(x[i]) + imag(y[i])*imag(x[i]) }
ADDSUBPS_X2_X3
ADDPS X3, SUM // SUM += X_i
dotc_end:
// Fold the partial sums: low half of SUM += low half of P_SUM + high half of P_SUM.
ADDPS P_SUM, SUM // SUM = { P_SUM[0] + SUM[0] }
MOVHLPS P_SUM, P_SUM // P_SUM = { P_SUM[1], P_SUM[1] }
ADDPS P_SUM, SUM // SUM = { P_SUM[1] + SUM[0] }
dotc_ret:
MOVSD SUM, sum+48(FP) // return SUM
RET

View File

@@ -0,0 +1,148 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
// Hand-encoded SSE3 instructions. LONG emits its 32-bit value little-endian,
// so e.g. $0xD3160FF3 produces the bytes F3 0F 16 D3. The trailing comment on
// each macro names the instruction in Go assembler order (source, destination).
#define MOVSHDUP_X3_X2 LONG $0xD3160FF3 // MOVSHDUP X3, X2
#define MOVSHDUP_X5_X4 LONG $0xE5160FF3 // MOVSHDUP X5, X4
#define MOVSHDUP_X7_X6 LONG $0xF7160FF3 // MOVSHDUP X7, X6
#define MOVSHDUP_X9_X8 LONG $0x160F45F3; BYTE $0xC1 // MOVSHDUP X9, X8
#define MOVSLDUP_X3_X3 LONG $0xDB120FF3 // MOVSLDUP X3, X3
#define MOVSLDUP_X5_X5 LONG $0xED120FF3 // MOVSLDUP X5, X5
#define MOVSLDUP_X7_X7 LONG $0xFF120FF3 // MOVSLDUP X7, X7
#define MOVSLDUP_X9_X9 LONG $0x120F45F3; BYTE $0xC9 // MOVSLDUP X9, X9
#define ADDSUBPS_X2_X3 LONG $0xDAD00FF2 // ADDSUBPS X2, X3
#define ADDSUBPS_X4_X5 LONG $0xECD00FF2 // ADDSUBPS X4, X5
#define ADDSUBPS_X6_X7 LONG $0xFED00FF2 // ADDSUBPS X6, X7
#define ADDSUBPS_X8_X9 LONG $0xD00F45F2; BYTE $0xC8 // ADDSUBPS X8, X9
// Register aliases used by DotuInc.
// X_PTR / Y_PTR: address of the current element of x / y.
#define X_PTR SI
#define Y_PTR DI
// LEN: element count, later the number of 4-element blocks; TAIL: n % 4.
#define LEN CX
#define TAIL BX
// SUM / P_SUM: the two accumulators the unrolled loop alternates between.
#define SUM X0
#define P_SUM X1
// INC_*: stride in bytes; INCx3_*: three times that stride.
#define INC_X R8
#define INCx3_X R9
#define INC_Y R10
#define INCx3_Y R11
// func DotuInc(x, y []complex64, n, incX, incY, ix, iy uintptr) (sum complex64)
//
// DotuInc returns the sum over i = 0 .. n-1 of
//   x[ix + i*incX] * y[iy + i*incY]
// (no conjugation) and returns 0 when n == 0. Only the base pointers of x and
// y are read; the slice lengths are never consulted, so no bounds checking
// happens here.
//
// Every element is loaded with MOVSD, so only the low two lanes of each
// vector register carry data, and only the low 64 bits of SUM are returned.
TEXT ·DotuInc(SB), NOSPLIT, $0
MOVQ x_base+0(FP), X_PTR // X_PTR = &x
MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y
PXOR SUM, SUM // SUM = 0
PXOR P_SUM, P_SUM // P_SUM = 0
MOVQ n+48(FP), LEN // LEN = n
CMPQ LEN, $0 // if LEN == 0 { return }
JE dotu_end
MOVQ ix+72(FP), INC_X
MOVQ iy+80(FP), INC_Y
LEAQ (X_PTR)(INC_X*8), X_PTR // X_PTR = &(X_PTR[ix])
LEAQ (Y_PTR)(INC_Y*8), Y_PTR // Y_PTR = &(Y_PTR[iy])
MOVQ incX+56(FP), INC_X // INC_X = incX * sizeof(complex64)
SHLQ $3, INC_X
MOVQ incY+64(FP), INC_Y // INC_Y = incY * sizeof(complex64)
SHLQ $3, INC_Y
MOVQ LEN, TAIL
ANDQ $3, TAIL // TAIL = LEN % 4
SHRQ $2, LEN // LEN = floor( LEN / 4 )
JZ dotu_tail // if LEN == 0 { goto dotu_tail }
LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3
LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = INC_Y * 3
dotu_loop: // do {
MOVSD (X_PTR), X3 // X_i = { imag(x[i]), real(x[i]) }
MOVSD (X_PTR)(INC_X*1), X5
MOVSD (X_PTR)(INC_X*2), X7
MOVSD (X_PTR)(INCx3_X*1), X9
// X_(i-1) = { imag(x[i]), imag(x[i]) }
MOVSHDUP_X3_X2
MOVSHDUP_X5_X4
MOVSHDUP_X7_X6
MOVSHDUP_X9_X8
// X_i = { real(x[i]), real(x[i]) }
MOVSLDUP_X3_X3
MOVSLDUP_X5_X5
MOVSLDUP_X7_X7
MOVSLDUP_X9_X9
// X_j = { imag(y[i]), real(y[i]) }
MOVSD (Y_PTR), X10
MOVSD (Y_PTR)(INC_Y*1), X11
MOVSD (Y_PTR)(INC_Y*2), X12
MOVSD (Y_PTR)(INCx3_Y*1), X13
// X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
MULPS X10, X3
MULPS X11, X5
MULPS X12, X7
MULPS X13, X9
// X_j = { real(y[i]), imag(y[i]) }
SHUFPS $0xB1, X10, X10
SHUFPS $0xB1, X11, X11
SHUFPS $0xB1, X12, X12
SHUFPS $0xB1, X13, X13
// X_(i-1) = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i]) }
MULPS X10, X2
MULPS X11, X4
MULPS X12, X6
MULPS X13, X8
// X_i = {
// imag(result[i]): imag(y[i]) * real(x[i]) + real(y[i]) * imag(x[i]),
// real(result[i]): real(y[i]) * real(x[i]) - imag(y[i]) * imag(x[i]) }
ADDSUBPS_X2_X3
ADDSUBPS_X4_X5
ADDSUBPS_X6_X7
ADDSUBPS_X8_X9
// SUM += X_i
ADDPS X3, SUM
ADDPS X5, P_SUM
ADDPS X7, SUM
ADDPS X9, P_SUM
LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[INC_X*4])
LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[INC_Y*4])
DECQ LEN
JNZ dotu_loop // } while --LEN > 0
ADDPS P_SUM, SUM // SUM = { P_SUM + SUM }
CMPQ TAIL, $0 // if TAIL == 0 { return }
JE dotu_end
dotu_tail: // do {
MOVSD (X_PTR), X3 // X_i = { imag(x[i]), real(x[i]) }
MOVSHDUP_X3_X2 // X_(i-1) = { imag(x[i]), imag(x[i]) }
MOVSLDUP_X3_X3 // X_i = { real(x[i]), real(x[i]) }
// Load exactly one complex64 (8 bytes). A 16-byte load here would read past
// y[i], and past the end of the slice on the last element.
MOVSD (Y_PTR), X10 // X_j = { imag(y[i]), real(y[i]) }
MULPS X10, X3 // X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
SHUFPS $0x1, X10, X10 // X_j = { real(y[i]), imag(y[i]) }
MULPS X10, X2 // X_(i-1) = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i]) }
// X_i = {
// imag(result[i]): imag(y[i])*real(x[i]) + real(y[i])*imag(x[i]),
// real(result[i]): real(y[i])*real(x[i]) - imag(y[i])*imag(x[i]) }
ADDSUBPS_X2_X3
ADDPS X3, SUM // SUM += X_i
ADDQ INC_X, X_PTR // X_PTR += INC_X
ADDQ INC_Y, Y_PTR // Y_PTR += INC_Y
DECQ TAIL
JNZ dotu_tail // } while --TAIL > 0
dotu_end:
MOVSD SUM, sum+88(FP) // return SUM (low 64 bits only)
RET

View File

@@ -0,0 +1,197 @@
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
// Hand-encoded SSE3 instructions. LONG and WORD emit their values
// little-endian, so e.g. LONG $0x1C120FF3; BYTE $0xC6 produces the bytes
// F3 0F 12 1C C6. The trailing comment on each macro names the instruction in
// Go assembler order (source, destination).
// The *_XPTR_IDX_8__* forms hard-code the memory operand disp(SI)(AX*8), so
// they are only valid while X_PTR is SI and IDX is AX.
#define MOVSLDUP_XPTR_IDX_8__X3 LONG $0x1C120FF3; BYTE $0xC6 // MOVSLDUP (SI)(AX*8), X3
#define MOVSLDUP_16_XPTR_IDX_8__X5 LONG $0x6C120FF3; WORD $0x10C6 // MOVSLDUP 16(SI)(AX*8), X5
#define MOVSLDUP_32_XPTR_IDX_8__X7 LONG $0x7C120FF3; WORD $0x20C6 // MOVSLDUP 32(SI)(AX*8), X7
#define MOVSLDUP_48_XPTR_IDX_8__X9 LONG $0x120F44F3; WORD $0xC64C; BYTE $0x30 // MOVSLDUP 48(SI)(AX*8), X9
#define MOVSHDUP_XPTR_IDX_8__X2 LONG $0x14160FF3; BYTE $0xC6 // MOVSHDUP (SI)(AX*8), X2
#define MOVSHDUP_16_XPTR_IDX_8__X4 LONG $0x64160FF3; WORD $0x10C6 // MOVSHDUP 16(SI)(AX*8), X4
#define MOVSHDUP_32_XPTR_IDX_8__X6 LONG $0x74160FF3; WORD $0x20C6 // MOVSHDUP 32(SI)(AX*8), X6
#define MOVSHDUP_48_XPTR_IDX_8__X8 LONG $0x160F44F3; WORD $0xC644; BYTE $0x30 // MOVSHDUP 48(SI)(AX*8), X8
#define MOVSHDUP_X3_X2 LONG $0xD3160FF3 // MOVSHDUP X3, X2
#define MOVSLDUP_X3_X3 LONG $0xDB120FF3 // MOVSLDUP X3, X3
#define ADDSUBPS_X2_X3 LONG $0xDAD00FF2 // ADDSUBPS X2, X3
#define ADDSUBPS_X4_X5 LONG $0xECD00FF2 // ADDSUBPS X4, X5
#define ADDSUBPS_X6_X7 LONG $0xFED00FF2 // ADDSUBPS X6, X7
#define ADDSUBPS_X8_X9 LONG $0xD00F45F2; BYTE $0xC8 // ADDSUBPS X8, X9
// Register aliases for the routine that follows.
// X_PTR / Y_PTR: base address of x / y.
#define X_PTR SI
#define Y_PTR DI
// LEN: element count, later a block count; TAIL: elements left after the 8-wide loop.
#define LEN CX
#define TAIL BX
// SUM / P_SUM: the two accumulators the unrolled loop alternates between.
#define SUM X0
#define P_SUM X1
// IDX: current element index; I_IDX: alias for DX.
#define IDX AX
#define I_IDX DX
// NEG1 / P_NEG1: aliases for X15 / X14.
#define NEG1 X15
#define P_NEG1 X14
// func DotuUnitary(x, y []complex64) (sum complex64)
TEXT ·DotuUnitary(SB), NOSPLIT, $0
MOVQ x_base+0(FP), X_PTR // X_PTR = &x
MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y
PXOR SUM, SUM // SUM = 0
PXOR P_SUM, P_SUM // P_SUM = 0
MOVQ x_len+8(FP), LEN // LEN = min( len(x), len(y) )
CMPQ y_len+32(FP), LEN
CMOVQLE y_len+32(FP), LEN
CMPQ LEN, $0 // if LEN == 0 { return }
JE dotu_end
XORQ IDX, IDX // IDX = 0
MOVQ X_PTR, DX
ANDQ $15, DX // DX = &x & 15
JZ dotu_aligned // if DX == 0 { goto dotu_aligned }
MOVSD (X_PTR)(IDX*8), X3 // X_i = { imag(x[i]), real(x[i]) }
MOVSHDUP_X3_X2 // X_(i-1) = { imag(x[i]), imag(x[i]) }
MOVSLDUP_X3_X3 // X_i = { real(x[i]), real(x[i]) }
MOVSD (Y_PTR)(IDX*8), X10 // X_j = { imag(y[i]), real(y[i]) }
MULPS X10, X3 // X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
SHUFPS $0x1, X10, X10 // X_j = { real(y[i]), imag(y[i]) }
MULPS X10, X2 // X_(i-1) = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i]) }
// X_i = {
// imag(result[i]): imag(y[i])*real(x[i]) + real(y[i])*imag(x[i]),
// real(result[i]): real(y[i])*real(x[i]) - imag(y[i])*imag(x[i]) }
ADDSUBPS_X2_X3
MOVAPS X3, SUM // SUM = X_i
INCQ IDX // IDX++
DECQ LEN // LEN--
JZ dotu_end // if LEN == 0 { goto dotu_end }
dotu_aligned:
MOVQ LEN, TAIL
ANDQ $7, TAIL // TAIL = LEN % 8
SHRQ $3, LEN // LEN = floor( LEN / 8 )
JZ dotu_tail // if LEN == 0 { goto dotu_tail }
PXOR P_SUM, P_SUM
dotu_loop: // do {
MOVSLDUP_XPTR_IDX_8__X3 // X_i = { real(x[i]), real(x[i]), real(x[i+1]), real(x[i+1]) }
MOVSLDUP_16_XPTR_IDX_8__X5
MOVSLDUP_32_XPTR_IDX_8__X7
MOVSLDUP_48_XPTR_IDX_8__X9
MOVSHDUP_XPTR_IDX_8__X2 // X_(i-1) = { imag(x[i]), imag(x[i]), imag(x[i]+1), imag(x[i]+1) }
MOVSHDUP_16_XPTR_IDX_8__X4
MOVSHDUP_32_XPTR_IDX_8__X6
MOVSHDUP_48_XPTR_IDX_8__X8
// X_j = { imag(y[i]), real(y[i]), imag(y[i+1]), real(y[i+1]) }
MOVUPS (Y_PTR)(IDX*8), X10
MOVUPS 16(Y_PTR)(IDX*8), X11
MOVUPS 32(Y_PTR)(IDX*8), X12
MOVUPS 48(Y_PTR)(IDX*8), X13
// X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]),
// imag(y[i+1]) * real(x[i+1]), real(y[i+1]) * real(x[i+1]) }
MULPS X10, X3
MULPS X11, X5
MULPS X12, X7
MULPS X13, X9
// X_j = { real(y[i]), imag(y[i]), real(y[i+1]), imag(y[i+1]) }
SHUFPS $0xB1, X10, X10
SHUFPS $0xB1, X11, X11
SHUFPS $0xB1, X12, X12
SHUFPS $0xB1, X13, X13
// X_(i-1) = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i]),
// real(y[i+1]) * imag(x[i+1]), imag(y[i+1]) * imag(x[i+1]) }
MULPS X10, X2
MULPS X11, X4
MULPS X12, X6
MULPS X13, X8
// X_i = {
// imag(result[i]): imag(y[i]) * real(x[i]) + real(y[i]) * imag(x[i]),
// real(result[i]): real(y[i]) * real(x[i]) - imag(y[i]) * imag(x[i]),
// imag(result[i+1]): imag(y[i+1]) * real(x[i+1]) + real(y[i+1]) * imag(x[i+1]),
// real(result[i+1]): real(y[i+1]) * real(x[i+1]) - imag(y[i+1]) * imag(x[i+1]),
// }
ADDSUBPS_X2_X3
ADDSUBPS_X4_X5
ADDSUBPS_X6_X7
ADDSUBPS_X8_X9
// SUM += X_i
ADDPS X3, SUM
ADDPS X5, P_SUM
ADDPS X7, SUM
ADDPS X9, P_SUM
ADDQ $8, IDX // IDX += 8
DECQ LEN
JNZ dotu_loop // } while --LEN > 0
ADDPS SUM, P_SUM // P_SUM = { P_SUM[1] + SUM[1], P_SUM[0] + SUM[0] }
XORPS SUM, SUM // SUM = 0
CMPQ TAIL, $0 // if TAIL == 0 { return }
JE dotu_end
dotu_tail:
MOVQ TAIL, LEN
SHRQ $1, LEN // LEN = floor( LEN / 2 )
JZ dotu_tail_one // if LEN == 0 { goto dotc_tail_one }
dotu_tail_two: // do {
MOVSLDUP_XPTR_IDX_8__X3 // X_i = { real(x[i]), real(x[i]), real(x[i+1]), real(x[i+1]) }
MOVSHDUP_XPTR_IDX_8__X2 // X_(i-1) = { imag(x[i]), imag(x[i]), imag(x[i]+1), imag(x[i]+1) }
MOVUPS (Y_PTR)(IDX*8), X10 // X_j = { imag(y[i]), real(y[i]) }
MULPS X10, X3 // X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
SHUFPS $0xB1, X10, X10 // X_j = { real(y[i]), imag(y[i]) }
MULPS X10, X2 // X_(i-1) = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i]) }
// X_i = {
// imag(result[i]): imag(y[i])*real(x[i]) + real(y[i])*imag(x[i]),
// real(result[i]): real(y[i])*real(x[i]) - imag(y[i])*imag(x[i]) }
ADDSUBPS_X2_X3
ADDPS X3, SUM // SUM += X_i
ADDQ $2, IDX // IDX += 2
DECQ LEN
JNZ dotu_tail_two // } while --LEN > 0
ADDPS SUM, P_SUM // P_SUM = { P_SUM[1] + SUM[1], P_SUM[0] + SUM[0] }
XORPS SUM, SUM // SUM = 0
ANDQ $1, TAIL
JZ dotu_end
dotu_tail_one:
MOVSD (X_PTR)(IDX*8), X3 // X_i = { imag(x[i]), real(x[i]) }
MOVSHDUP_X3_X2 // X_(i-1) = { imag(x[i]), imag(x[i]) }
MOVSLDUP_X3_X3 // X_i = { real(x[i]), real(x[i]) }
MOVSD (Y_PTR)(IDX*8), X10 // X_j = { imag(y[i]), real(y[i]) }
MULPS X10, X3 // X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
SHUFPS $0x1, X10, X10 // X_j = { real(y[i]), imag(y[i]) }
MULPS X10, X2 // X_(i-1) = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i]) }
// X_i = {
// imag(result[i]): imag(y[i])*real(x[i]) + real(y[i])*imag(x[i]),
// real(result[i]): real(y[i])*real(x[i]) - imag(y[i])*imag(x[i]) }
ADDSUBPS_X2_X3
ADDPS X3, SUM // SUM += X_i
dotu_end:
ADDPS P_SUM, SUM // SUM = { P_SUM[0] + SUM[0] }
MOVHLPS P_SUM, P_SUM // P_SUM = { P_SUM[1], P_SUM[1] }
ADDPS P_SUM, SUM // SUM = { P_SUM[1] + SUM[0] }
dotu_ret:
MOVSD SUM, sum+48(FP) // return SUM
RET

85
vendor/gonum.org/v1/gonum/internal/asm/c64/scal.go generated vendored Normal file
View File

@@ -0,0 +1,85 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package c64
// ScalUnitary multiplies every element of x by alpha in place. It is
//
//	for i := range x {
//		x[i] *= alpha
//	}
func ScalUnitary(alpha complex64, x []complex64) {
	for k := 0; k < len(x); k++ {
		x[k] = x[k] * alpha
	}
}
// ScalUnitaryTo stores alpha times each element of x into the matching
// element of dst. It is
//
//	for i, v := range x {
//		dst[i] = alpha * v
//	}
func ScalUnitaryTo(dst []complex64, alpha complex64, x []complex64) {
	for k := range x {
		dst[k] = alpha * x[k]
	}
}
// ScalInc multiplies n elements of x by alpha in place, starting at index 0
// and stepping by incX. It is
//
//	var ix uintptr
//	for i := 0; i < int(n); i++ {
//		x[ix] *= alpha
//		ix += incX
//	}
func ScalInc(alpha complex64, x []complex64, n, incX uintptr) {
	pos := uintptr(0)
	for left := int(n); left > 0; left-- {
		x[pos] = x[pos] * alpha
		pos += incX
	}
}
// ScalIncTo writes alpha times n strided elements of x into strided
// positions of dst; both walks start at index 0. It is
//
//	var idst, ix uintptr
//	for i := 0; i < int(n); i++ {
//		dst[idst] = alpha * x[ix]
//		ix += incX
//		idst += incDst
//	}
func ScalIncTo(dst []complex64, incDst uintptr, alpha complex64, x []complex64, n, incX uintptr) {
	var src, out uintptr
	for left := int(n); left > 0; left-- {
		dst[out] = alpha * x[src]
		src, out = src+incX, out+incDst
	}
}
// SscalUnitary scales the real and imaginary parts of every element of x by
// the real scalar alpha, in place. It is
//
//	for i, v := range x {
//		x[i] = complex(real(v)*alpha, imag(v)*alpha)
//	}
func SscalUnitary(alpha float32, x []complex64) {
	for k := range x {
		re, im := real(x[k]), imag(x[k])
		x[k] = complex(re*alpha, im*alpha)
	}
}
// SscalInc scales the real and imaginary parts of n strided elements of x by
// the real scalar alpha, in place, starting at index 0. It is
//
//	var ix uintptr
//	for i := 0; i < int(n); i++ {
//		x[ix] = complex(real(x[ix])*alpha, imag(x[ix])*alpha)
//		ix += inc
//	}
func SscalInc(alpha float32, x []complex64, n, inc uintptr) {
	pos := uintptr(0)
	for left := int(n); left > 0; left-- {
		re, im := real(x[pos]), imag(x[pos])
		x[pos] = complex(re*alpha, im*alpha)
		pos += inc
	}
}

180
vendor/gonum.org/v1/gonum/internal/asm/c64/stubs.go generated vendored Normal file
View File

@@ -0,0 +1,180 @@
// Copyright ©2020 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package c64
import (
"gonum.org/v1/gonum/internal/cmplx64"
"gonum.org/v1/gonum/internal/math32"
)
// Add accumulates each element of s into the matching element of dst. It is
//
//	for i, v := range s {
//		dst[i] += v
//	}
func Add(dst, s []complex64) {
	for k := range s {
		dst[k] = dst[k] + s[k]
	}
}
// AddConst adds alpha to every element of x in place. It is
//
//	for i := range x {
//		x[i] += alpha
//	}
func AddConst(alpha complex64, x []complex64) {
	for k := 0; k < len(x); k++ {
		x[k] = x[k] + alpha
	}
}
// CumSum writes the running sum of s into dst and returns dst. It is
//
//	if len(s) == 0 {
//		return dst
//	}
//	dst[0] = s[0]
//	for i, v := range s[1:] {
//		dst[i+1] = dst[i] + v
//	}
//	return dst
func CumSum(dst, s []complex64) []complex64 {
	count := len(s)
	if count == 0 {
		return dst
	}
	dst[0] = s[0]
	for k := 1; k < count; k++ {
		dst[k] = dst[k-1] + s[k]
	}
	return dst
}
// CumProd writes the running product of s into dst and returns dst. It is
//
//	if len(s) == 0 {
//		return dst
//	}
//	dst[0] = s[0]
//	for i, v := range s[1:] {
//		dst[i+1] = dst[i] * v
//	}
//	return dst
func CumProd(dst, s []complex64) []complex64 {
	count := len(s)
	if count == 0 {
		return dst
	}
	dst[0] = s[0]
	for k := 1; k < count; k++ {
		dst[k] = dst[k-1] * s[k]
	}
	return dst
}
// Div divides each element of dst by the matching element of s, in place.
// It is
//
//	for i, v := range s {
//		dst[i] /= v
//	}
func Div(dst, s []complex64) {
	for k := range s {
		dst[k] = dst[k] / s[k]
	}
}
// DivTo stores the element-wise quotient s/t in dst and returns dst. It is
//
//	for i, v := range s {
//		dst[i] = v / t[i]
//	}
//	return dst
func DivTo(dst, s, t []complex64) []complex64 {
	for k := range s {
		dst[k] = s[k] / t[k]
	}
	return dst
}
// DotUnitary returns the sum over i of conj(x[i]) * y[i]. It is
//
//	for i, v := range x {
//		sum += conj(v) * y[i]
//	}
//	return sum
func DotUnitary(x, y []complex64) (sum complex64) {
	for k := 0; k < len(x); k++ {
		sum += cmplx64.Conj(x[k]) * y[k]
	}
	return sum
}
// L2DistanceUnitary returns the L2-norm of x-y.
//
// The loop tracks the largest magnitude seen so far and a sum of squares
// expressed relative to that magnitude. Zero differences are skipped, a NaN
// magnitude yields NaN immediately, and an infinite largest magnitude yields
// +Inf.
func L2DistanceUnitary(x, y []complex64) (norm float32) {
	var largest float32
	ssq := float32(1.0)
	for k := range x {
		diff := x[k] - y[k]
		if diff == 0 {
			continue
		}
		mag := cmplx64.Abs(diff)
		if math32.IsNaN(mag) {
			return math32.NaN()
		}
		if mag > largest {
			// New maximum: re-express the accumulated sum relative to it.
			ratio := largest / mag
			ssq = 1 + ssq*ratio*ratio
			largest = mag
			continue
		}
		ratio := mag / largest
		ssq += ratio * ratio
	}
	if math32.IsInf(largest, 1) {
		return math32.Inf(1)
	}
	return largest * math32.Sqrt(ssq)
}
// L2NormUnitary returns the L2-norm of x.
//
// The loop tracks the largest magnitude seen so far and a sum of squares
// expressed relative to that magnitude. Zero elements are skipped, a NaN
// magnitude yields NaN immediately, and an infinite largest magnitude yields
// +Inf.
func L2NormUnitary(x []complex64) (norm float32) {
	var largest float32
	ssq := float32(1.0)
	for k := 0; k < len(x); k++ {
		if x[k] == 0 {
			continue
		}
		mag := cmplx64.Abs(x[k])
		if math32.IsNaN(mag) {
			return math32.NaN()
		}
		if mag > largest {
			// New maximum: re-express the accumulated sum relative to it.
			ratio := largest / mag
			ssq = 1 + ssq*ratio*ratio
			largest = mag
			continue
		}
		ratio := mag / largest
		ssq += ratio * ratio
	}
	if math32.IsInf(largest, 1) {
		return math32.Inf(1)
	}
	return largest * math32.Sqrt(ssq)
}
// Sum returns the sum of the elements of x. It is
//
//	var sum complex64
//	for i := range x {
//		sum += x[i]
//	}
//	return sum
func Sum(x []complex64) complex64 {
	var total complex64
	for k := 0; k < len(x); k++ {
		total += x[k]
	}
	return total
}

View File

@@ -0,0 +1,77 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !noasm && !gccgo && !safe
// +build !noasm,!gccgo,!safe
package c64
// AxpyUnitary adds alpha*x to y element-wise, in place. It is
//
//	for i, v := range x {
//		y[i] += alpha * v
//	}
//
// The declaration has no Go body, so the implementation is supplied outside
// Go source (presumably the package's amd64 assembly; confirm against the
// accompanying .s file).
func AxpyUnitary(alpha complex64, x, y []complex64)
// AxpyUnitaryTo stores alpha*x + y element-wise into dst. It is
//
//	for i, v := range x {
//		dst[i] = alpha*v + y[i]
//	}
//
// The declaration has no Go body, so the implementation is supplied outside
// Go source (presumably the package's amd64 assembly; confirm against the
// accompanying .s file).
func AxpyUnitaryTo(dst []complex64, alpha complex64, x, y []complex64)
// AxpyInc adds alpha*x to y over n strided elements, starting at x[ix] and
// y[iy] and stepping by incX and incY. It is
//
//	for i := 0; i < int(n); i++ {
//		y[iy] += alpha * x[ix]
//		ix += incX
//		iy += incY
//	}
//
// The declaration has no Go body, so the implementation is supplied outside
// Go source (presumably the package's amd64 assembly; confirm against the
// accompanying .s file).
func AxpyInc(alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr)
// AxpyIncTo stores alpha*x + y into dst over n strided elements, starting at
// x[ix], y[iy] and dst[idst] and stepping by incX, incY and incDst. It is
//
//	for i := 0; i < int(n); i++ {
//		dst[idst] = alpha*x[ix] + y[iy]
//		ix += incX
//		iy += incY
//		idst += incDst
//	}
//
// The declaration has no Go body, so the implementation is supplied outside
// Go source (presumably the package's amd64 assembly; confirm against the
// accompanying .s file).
func AxpyIncTo(dst []complex64, incDst, idst uintptr, alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr)
// DotcUnitary returns the sum over i of y[i] * conj(x[i]). It is
//
//	for i, v := range x {
//		sum += y[i] * conj(v)
//	}
//	return sum
//
// The declaration has no Go body, so the implementation is supplied outside
// Go source (presumably the package's amd64 assembly; confirm against the
// accompanying .s file).
func DotcUnitary(x, y []complex64) (sum complex64)
// DotcInc returns the sum of y[iy] * conj(x[ix]) over n strided elements,
// starting at x[ix] and y[iy] and stepping by incX and incY. It is
//
//	for i := 0; i < int(n); i++ {
//		sum += y[iy] * conj(x[ix])
//		ix += incX
//		iy += incY
//	}
//	return sum
//
// The declaration has no Go body, so the implementation is supplied outside
// Go source (presumably the package's amd64 assembly; confirm against the
// accompanying .s file).
func DotcInc(x, y []complex64, n, incX, incY, ix, iy uintptr) (sum complex64)
// DotuUnitary returns the unconjugated sum over i of y[i] * x[i]. It is
//
//	for i, v := range x {
//		sum += y[i] * v
//	}
//	return sum
//
// The declaration has no Go body, so the implementation is supplied outside
// Go source (presumably the package's amd64 assembly; confirm against the
// accompanying .s file).
func DotuUnitary(x, y []complex64) (sum complex64)
// DotuInc returns the unconjugated sum of y[iy] * x[ix] over n strided
// elements, starting at x[ix] and y[iy] and stepping by incX and incY. It is
//
//	for i := 0; i < int(n); i++ {
//		sum += y[iy] * x[ix]
//		ix += incX
//		iy += incY
//	}
//	return sum
//
// The declaration has no Go body, so the implementation is supplied outside
// Go source (presumably the package's amd64 assembly; confirm against the
// accompanying .s file).
func DotuInc(x, y []complex64, n, incX, incY, ix, iy uintptr) (sum complex64)

View File

@@ -0,0 +1,122 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !amd64 || noasm || gccgo || safe
// +build !amd64 noasm gccgo safe
package c64
// AxpyUnitary adds alpha*x to y element-wise, in place. It is
//
//	for i, v := range x {
//		y[i] += alpha * v
//	}
func AxpyUnitary(alpha complex64, x, y []complex64) {
	for k := range x {
		y[k] += alpha * x[k]
	}
}
// AxpyUnitaryTo stores alpha*x + y element-wise into dst. It is
//
//	for i, v := range x {
//		dst[i] = alpha*v + y[i]
//	}
func AxpyUnitaryTo(dst []complex64, alpha complex64, x, y []complex64) {
	for k := range x {
		dst[k] = alpha*x[k] + y[k]
	}
}
// AxpyInc adds alpha*x to y over n strided elements, starting at x[ix] and
// y[iy] and stepping by incX and incY. It is
//
//	for i := 0; i < int(n); i++ {
//		y[iy] += alpha * x[ix]
//		ix += incX
//		iy += incY
//	}
func AxpyInc(alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr) {
	for left := int(n); left > 0; left-- {
		y[iy] += alpha * x[ix]
		ix, iy = ix+incX, iy+incY
	}
}
// AxpyIncTo stores alpha*x + y into dst over n strided elements, starting at
// x[ix], y[iy] and dst[idst] and stepping by incX, incY and incDst. It is
//
//	for i := 0; i < int(n); i++ {
//		dst[idst] = alpha*x[ix] + y[iy]
//		ix += incX
//		iy += incY
//		idst += incDst
//	}
func AxpyIncTo(dst []complex64, incDst, idst uintptr, alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr) {
	for left := int(n); left > 0; left-- {
		dst[idst] = alpha*x[ix] + y[iy]
		ix, iy, idst = ix+incX, iy+incY, idst+incDst
	}
}
// DotcUnitary returns the sum over i of y[i] * conj(x[i]). It is
//
//	for i, v := range x {
//		sum += y[i] * conj(v)
//	}
//	return sum
func DotcUnitary(x, y []complex64) (sum complex64) {
	for k := 0; k < len(x); k++ {
		sum += y[k] * conj(x[k])
	}
	return sum
}
// DotcInc returns the sum of y[iy] * conj(x[ix]) over n strided elements,
// starting at x[ix] and y[iy] and stepping by incX and incY. It is
//
//	for i := 0; i < int(n); i++ {
//		sum += y[iy] * conj(x[ix])
//		ix += incX
//		iy += incY
//	}
//	return sum
func DotcInc(x, y []complex64, n, incX, incY, ix, iy uintptr) (sum complex64) {
	for left := int(n); left > 0; left-- {
		sum += y[iy] * conj(x[ix])
		ix, iy = ix+incX, iy+incY
	}
	return sum
}
// DotuUnitary returns the unconjugated sum over i of y[i] * x[i]. It is
//
//	for i, v := range x {
//		sum += y[i] * v
//	}
//	return sum
func DotuUnitary(x, y []complex64) (sum complex64) {
	for k := 0; k < len(x); k++ {
		sum += y[k] * x[k]
	}
	return sum
}
// DotuInc returns the unconjugated sum of y[iy] * x[ix] over n strided
// elements, starting at x[ix] and y[iy] and stepping by incX and incY. It is
//
//	for i := 0; i < int(n); i++ {
//		sum += y[iy] * x[ix]
//		ix += incX
//		iy += incY
//	}
//	return sum
func DotuInc(x, y []complex64, n, incX, incY, ix, iy uintptr) (sum complex64) {
	for left := int(n); left > 0; left-- {
		sum += y[iy] * x[ix]
		ix, iy = ix+incX, iy+incY
	}
	return sum
}

View File

@@ -0,0 +1,73 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
// func AxpyInc(alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr)
//
// AxpyInc computes y[iy + i*incY] += alpha * x[ix + i*incX] for i in [0, n).
//
// Register roles:
//   SI = read pointer into x      R8 = incX in bytes
//   DX = read pointer into y      R9 = incY in bytes
//   DI = write pointer into y     CX = loop counter (consumed by LOOP)
//   BX = n % 4 (tail count)       X0, X1 = alpha (two copies)
TEXT ·AxpyInc(SB), NOSPLIT, $0
MOVQ n+56(FP), CX // CX = n
CMPQ CX, $0 // if n <= 0 (signed compare) { return }
JLE axpyi_end
MOVQ x_base+8(FP), SI // SI = &x
MOVQ y_base+32(FP), DI // DI = &y
MOVQ ix+80(FP), R8 // R8 = ix
MOVQ iy+88(FP), R9 // R9 = iy
LEAQ (SI)(R8*4), SI // SI = &(x[ix])
LEAQ (DI)(R9*4), DI // DI = &(y[iy])
MOVQ DI, DX // DX = DI Read Pointer for y
MOVQ incX+64(FP), R8 // R8 = incX
SHLQ $2, R8 // R8 *= sizeof(float32)
MOVQ incY+72(FP), R9 // R9 = incY
SHLQ $2, R9 // R9 *= sizeof(float32)
MOVSS alpha+0(FP), X0 // X0 = alpha
MOVSS X0, X1 // X1 = X0 // for pipelining
MOVQ CX, BX
ANDQ $3, BX // BX = n % 4
SHRQ $2, CX // CX = floor( n / 4 )
JZ axpyi_tail_start // if CX == 0 { goto axpyi_tail_start }
axpyi_loop: // Loop unrolled 4x do {
MOVSS (SI), X2 // X_i = x[i]
MOVSS (SI)(R8*1), X3
LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2])
MOVSS (SI), X4
MOVSS (SI)(R8*1), X5
MULSS X1, X2 // X_i *= a
MULSS X0, X3
MULSS X1, X4
MULSS X0, X5
ADDSS (DX), X2 // X_i += y[i]
ADDSS (DX)(R9*1), X3
LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2])
ADDSS (DX), X4
ADDSS (DX)(R9*1), X5
MOVSS X2, (DI) // y[i] = X_i
MOVSS X3, (DI)(R9*1)
LEAQ (DI)(R9*2), DI // DI = &(DI[incY*2])
MOVSS X4, (DI)
MOVSS X5, (DI)(R9*1)
LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) // Increment addresses
LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2])
LEAQ (DI)(R9*2), DI // DI = &(DI[incY*2])
LOOP axpyi_loop // } while --CX > 0
CMPQ BX, $0 // if BX == 0 { return }
JE axpyi_end
axpyi_tail_start: // Reset loop registers
MOVQ BX, CX // Loop counter: CX = BX
axpyi_tail: // do {
MOVSS (SI), X2 // X2 = x[i]
MULSS X1, X2 // X2 *= a
ADDSS (DI), X2 // X2 += y[i] (DI and DX advance in lock step, so DI is also a valid read pointer)
MOVSS X2, (DI) // y[i] = X2
ADDQ R8, SI // SI = &(SI[incX])
ADDQ R9, DI // DI = &(DI[incY])
LOOP axpyi_tail // } while --CX > 0
axpyi_end:
RET

View File

@@ -0,0 +1,78 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
// func AxpyIncTo(dst []float32, incDst, idst uintptr, alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr)
//
// AxpyIncTo computes
// dst[idst + i*incDst] = alpha * x[ix + i*incX] + y[iy + i*incY] for i in [0, n).
//
// Register roles:
//   SI = pointer into x        R8  = incX in bytes
//   DX = pointer into y        R9  = incY in bytes
//   DI = pointer into dst      R10 = incDst in bytes
//   CX = loop counter (consumed by LOOP)
//   BX = n % 4 (tail count)    X0, X1 = alpha (two copies)
TEXT ·AxpyIncTo(SB), NOSPLIT, $0
MOVQ n+96(FP), CX // CX = n
CMPQ CX, $0 // if n <= 0 (signed compare) { return }
JLE axpyi_end
MOVQ dst_base+0(FP), DI // DI = &dst
MOVQ x_base+48(FP), SI // SI = &x
MOVQ y_base+72(FP), DX // DX = &y
MOVQ ix+120(FP), R8 // R8 = ix // Load the first index
MOVQ iy+128(FP), R9 // R9 = iy
MOVQ idst+32(FP), R10 // R10 = idst
LEAQ (SI)(R8*4), SI // SI = &(x[ix])
LEAQ (DX)(R9*4), DX // DX = &(y[iy])
LEAQ (DI)(R10*4), DI // DI = &(dst[idst])
MOVQ incX+104(FP), R8 // R8 = incX
SHLQ $2, R8 // R8 *= sizeof(float32)
MOVQ incY+112(FP), R9 // R9 = incY
SHLQ $2, R9 // R9 *= sizeof(float32)
MOVQ incDst+24(FP), R10 // R10 = incDst
SHLQ $2, R10 // R10 *= sizeof(float32)
MOVSS alpha+40(FP), X0 // X0 = alpha
MOVSS X0, X1 // X1 = X0 // for pipelining
MOVQ CX, BX
ANDQ $3, BX // BX = n % 4
SHRQ $2, CX // CX = floor( n / 4 )
JZ axpyi_tail_start // if CX == 0 { goto axpyi_tail_start }
axpyi_loop: // Loop unrolled 4x do {
MOVSS (SI), X2 // X_i = x[i]
MOVSS (SI)(R8*1), X3
LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2])
MOVSS (SI), X4
MOVSS (SI)(R8*1), X5
MULSS X1, X2 // X_i *= a
MULSS X0, X3
MULSS X1, X4
MULSS X0, X5
ADDSS (DX), X2 // X_i += y[i]
ADDSS (DX)(R9*1), X3
LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2])
ADDSS (DX), X4
ADDSS (DX)(R9*1), X5
MOVSS X2, (DI) // dst[i] = X_i
MOVSS X3, (DI)(R10*1)
LEAQ (DI)(R10*2), DI // DI = &(DI[incDst*2])
MOVSS X4, (DI)
MOVSS X5, (DI)(R10*1)
LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) // Increment addresses
LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2])
LEAQ (DI)(R10*2), DI // DI = &(DI[incDst*2])
LOOP axpyi_loop // } while --CX > 0
CMPQ BX, $0 // if BX == 0 { return }
JE axpyi_end
axpyi_tail_start: // Reset loop registers
MOVQ BX, CX // Loop counter: CX = BX
axpyi_tail: // do {
MOVSS (SI), X2 // X2 = x[i]
MULSS X1, X2 // X2 *= a
ADDSS (DX), X2 // X2 += y[i]
MOVSS X2, (DI) // dst[i] = X2
ADDQ R8, SI // SI = &(SI[incX])
ADDQ R9, DX // DX = &(DX[incY])
ADDQ R10, DI // DI = &(DI[incDst])
LOOP axpyi_tail // } while --CX > 0
axpyi_end:
RET

View File

@@ -0,0 +1,97 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
// func AxpyUnitary(alpha float32, x, y []float32)
//
// AxpyUnitary computes y[i] += alpha * x[i] for i in [0, min(len(x), len(y))).
//
// Phases: a scalar loop that runs until &y[i] is 16-byte aligned (ADDPS with
// a memory operand reads y), a 16-element unrolled SSE loop, a 4-element SSE
// loop, and a scalar tail.
//
// Register roles:
//   SI = &x, DI = &y, AX = element index i
//   BX = elements remaining / tail count, CX = loop counter (consumed by LOOP)
//   X0, X1 = alpha broadcast to all four lanes
TEXT ·AxpyUnitary(SB), NOSPLIT, $0
MOVQ x_base+8(FP), SI // SI = &x
MOVQ y_base+32(FP), DI // DI = &y
MOVQ x_len+16(FP), BX // BX = min( len(x), len(y) )
CMPQ y_len+40(FP), BX
CMOVQLE y_len+40(FP), BX
CMPQ BX, $0 // if BX == 0 { return }
JE axpy_end
MOVSS alpha+0(FP), X0
SHUFPS $0, X0, X0 // X0 = { a, a, a, a }
XORQ AX, AX // i = 0
PXOR X2, X2 // 2 NOP instructions (PXOR) to align
PXOR X3, X3 // loop to cache line
MOVQ DI, CX
ANDQ $0xF, CX // Align on 16-byte boundary for ADDPS
JZ axpy_no_trim // if CX == 0 { goto axpy_no_trim }
XORQ $0xF, CX // CX = ( 16 - ( &y & 15 ) ) / 4: elements before y is 16-byte aligned
INCQ CX
SHRQ $2, CX
axpy_align: // Trim first value(s) in unaligned buffer do {
MOVSS (SI)(AX*4), X2 // X2 = x[i]
MULSS X0, X2 // X2 *= a
ADDSS (DI)(AX*4), X2 // X2 += y[i]
MOVSS X2, (DI)(AX*4) // y[i] = X2
INCQ AX // i++
DECQ BX
JZ axpy_end // if --BX == 0 { return }
LOOP axpy_align // } while --CX > 0
axpy_no_trim:
MOVUPS X0, X1 // Copy X0 to X1 for pipelining
MOVQ BX, CX
ANDQ $0xF, BX // BX = len % 16
SHRQ $4, CX // CX = int( len / 16 )
JZ axpy_tail4_start // if CX == 0 { goto axpy_tail4_start }
axpy_loop: // Loop unrolled 16x do {
MOVUPS (SI)(AX*4), X2 // X2 = x[i:i+4]
MOVUPS 16(SI)(AX*4), X3
MOVUPS 32(SI)(AX*4), X4
MOVUPS 48(SI)(AX*4), X5
MULPS X0, X2 // X2 *= a
MULPS X1, X3
MULPS X0, X4
MULPS X1, X5
ADDPS (DI)(AX*4), X2 // X2 += y[i:i+4]
ADDPS 16(DI)(AX*4), X3
ADDPS 32(DI)(AX*4), X4
ADDPS 48(DI)(AX*4), X5
MOVUPS X2, (DI)(AX*4) // y[i:i+4] = X2
MOVUPS X3, 16(DI)(AX*4)
MOVUPS X4, 32(DI)(AX*4)
MOVUPS X5, 48(DI)(AX*4)
ADDQ $16, AX // i += 16
LOOP axpy_loop // while (--CX) > 0
CMPQ BX, $0 // if BX == 0 { return }
JE axpy_end
axpy_tail4_start: // Reset loop counter for 4-wide tail loop
MOVQ BX, CX // CX = floor( BX / 4 )
SHRQ $2, CX
JZ axpy_tail_start // if CX == 0 { goto axpy_tail_start }
axpy_tail4: // Loop unrolled 4x do {
MOVUPS (SI)(AX*4), X2 // X2 = x[i:i+4]
MULPS X0, X2 // X2 *= a
ADDPS (DI)(AX*4), X2 // X2 += y[i:i+4]
MOVUPS X2, (DI)(AX*4) // y[i:i+4] = X2
ADDQ $4, AX // i += 4
LOOP axpy_tail4 // } while --CX > 0
axpy_tail_start: // Reset loop counter for 1-wide tail loop
MOVQ BX, CX // CX = BX % 4
ANDQ $3, CX
JZ axpy_end // if CX == 0 { return }
axpy_tail:
MOVSS (SI)(AX*4), X1 // X1 = x[i]
MULSS X0, X1 // X1 *= a
ADDSS (DI)(AX*4), X1 // X1 += y[i]
MOVSS X1, (DI)(AX*4) // y[i] = X1
INCQ AX // i++
LOOP axpy_tail // } while --CX > 0
axpy_end:
RET

View File

@@ -0,0 +1,98 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
// func AxpyUnitaryTo(dst []float32, alpha float32, x, y []float32)
//
// AxpyUnitaryTo computes dst[i] = alpha * x[i] + y[i] for i in
// [0, min(len(x), len(y), len(dst))).
//
// Phases: a scalar loop that runs until &y[i] is 16-byte aligned (ADDPS with
// a memory operand reads y), a 16-element unrolled SSE loop, a 4-element SSE
// loop, and a scalar tail.
//
// Register roles:
//   SI = &x, DX = &y, DI = &dst, AX = element index i
//   BX = elements remaining / tail count, CX = loop counter (consumed by LOOP)
//   X0, X1 = alpha broadcast to all four lanes
TEXT ·AxpyUnitaryTo(SB), NOSPLIT, $0
MOVQ dst_base+0(FP), DI // DI = &dst
MOVQ x_base+32(FP), SI // SI = &x
MOVQ y_base+56(FP), DX // DX = &y
MOVQ x_len+40(FP), BX // BX = min( len(x), len(y), len(dst) )
CMPQ y_len+64(FP), BX
CMOVQLE y_len+64(FP), BX
CMPQ dst_len+8(FP), BX
CMOVQLE dst_len+8(FP), BX
CMPQ BX, $0 // if BX == 0 { return }
JE axpy_end
MOVSS alpha+24(FP), X0
SHUFPS $0, X0, X0 // X0 = { a, a, a, a, }
XORQ AX, AX // i = 0
MOVQ DX, CX
ANDQ $0xF, CX // Align on 16-byte boundary for ADDPS
JZ axpy_no_trim // if CX == 0 { goto axpy_no_trim }
XORQ $0xF, CX // CX = ( 16 - ( &y & 15 ) ) / 4: elements before y is 16-byte aligned
INCQ CX
SHRQ $2, CX
axpy_align: // Trim first value(s) in unaligned buffer do {
MOVSS (SI)(AX*4), X2 // X2 = x[i]
MULSS X0, X2 // X2 *= a
ADDSS (DX)(AX*4), X2 // X2 += y[i]
MOVSS X2, (DI)(AX*4) // dst[i] = X2
INCQ AX // i++
DECQ BX
JZ axpy_end // if --BX == 0 { return }
LOOP axpy_align // } while --CX > 0
axpy_no_trim:
MOVUPS X0, X1 // Copy X0 to X1 for pipelining
MOVQ BX, CX
ANDQ $0xF, BX // BX = len % 16
SHRQ $4, CX // CX = floor( len / 16 )
JZ axpy_tail4_start // if CX == 0 { goto axpy_tail4_start }
axpy_loop: // Loop unrolled 16x do {
MOVUPS (SI)(AX*4), X2 // X2 = x[i:i+4]
MOVUPS 16(SI)(AX*4), X3
MOVUPS 32(SI)(AX*4), X4
MOVUPS 48(SI)(AX*4), X5
MULPS X0, X2 // X2 *= a
MULPS X1, X3
MULPS X0, X4
MULPS X1, X5
ADDPS (DX)(AX*4), X2 // X2 += y[i:i+4]
ADDPS 16(DX)(AX*4), X3
ADDPS 32(DX)(AX*4), X4
ADDPS 48(DX)(AX*4), X5
MOVUPS X2, (DI)(AX*4) // dst[i:i+4] = X2
MOVUPS X3, 16(DI)(AX*4)
MOVUPS X4, 32(DI)(AX*4)
MOVUPS X5, 48(DI)(AX*4)
ADDQ $16, AX // i += 16
LOOP axpy_loop // while (--CX) > 0
CMPQ BX, $0 // if BX == 0 { return }
JE axpy_end
axpy_tail4_start: // Reset loop counter for 4-wide tail loop
MOVQ BX, CX // CX = floor( BX / 4 )
SHRQ $2, CX
JZ axpy_tail_start // if CX == 0 { goto axpy_tail_start }
axpy_tail4: // Loop unrolled 4x do {
MOVUPS (SI)(AX*4), X2 // X2 = x[i:i+4]
MULPS X0, X2 // X2 *= a
ADDPS (DX)(AX*4), X2 // X2 += y[i:i+4]
MOVUPS X2, (DI)(AX*4) // dst[i:i+4] = X2
ADDQ $4, AX // i += 4
LOOP axpy_tail4 // } while --CX > 0
axpy_tail_start: // Reset loop counter for 1-wide tail loop
MOVQ BX, CX // CX = BX % 4
ANDQ $3, CX
JZ axpy_end // if CX == 0 { return }
axpy_tail:
MOVSS (SI)(AX*4), X1 // X1 = x[i]
MULSS X0, X1 // X1 *= a
ADDSS (DX)(AX*4), X1 // X1 += y[i]
MOVSS X1, (DI)(AX*4) // dst[i] = X1
INCQ AX // i++
LOOP axpy_tail // } while --CX > 0
axpy_end:
RET

View File

@@ -0,0 +1,91 @@
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
// Register aliases used by DdotInc.
#define X_PTR SI
#define Y_PTR DI
#define LEN CX
#define TAIL BX
#define INC_X R8
#define INCx3_X R10
#define INC_Y R9
#define INCx3_Y R11
#define SUM X0
#define P_SUM X1
// func DdotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float64)
//
// DdotInc returns the sum of float64(x[ix + i*incX]) * float64(y[iy + i*incY])
// for i in [0, n). Each float32 operand is widened with CVTSS2SD before the
// multiply, and the accumulation is done in double precision. The main loop
// is unrolled 4x with two accumulators (SUM and P_SUM).
TEXT ·DdotInc(SB), NOSPLIT, $0
MOVQ x_base+0(FP), X_PTR // X_PTR = &x
MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y
MOVQ n+48(FP), LEN // LEN = n
PXOR SUM, SUM // SUM = 0
CMPQ LEN, $0 // if LEN == 0 { return 0 }
JE dot_end
MOVQ ix+72(FP), INC_X // INC_X = ix
MOVQ iy+80(FP), INC_Y // INC_Y = iy
LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(x[ix])
LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(y[iy])
MOVQ incX+56(FP), INC_X // INC_X = incX * sizeof(float32)
SHLQ $2, INC_X
MOVQ incY+64(FP), INC_Y // INC_Y = incY * sizeof(float32)
SHLQ $2, INC_Y
MOVQ LEN, TAIL
ANDQ $3, TAIL // TAIL = LEN % 4
SHRQ $2, LEN // LEN = floor( LEN / 4 )
JZ dot_tail // if LEN == 0 { goto dot_tail }
PXOR P_SUM, P_SUM // P_SUM = 0 for pipelining
LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3
LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = INC_Y * 3
dot_loop: // Loop unrolled 4x do {
CVTSS2SD (X_PTR), X2 // X_i = float64(x[i])
CVTSS2SD (X_PTR)(INC_X*1), X3
CVTSS2SD (X_PTR)(INC_X*2), X4
CVTSS2SD (X_PTR)(INCx3_X*1), X5
CVTSS2SD (Y_PTR), X6 // X_j = float64(y[i])
CVTSS2SD (Y_PTR)(INC_Y*1), X7
CVTSS2SD (Y_PTR)(INC_Y*2), X8
CVTSS2SD (Y_PTR)(INCx3_Y*1), X9
MULSD X6, X2 // X_i *= X_j
MULSD X7, X3
MULSD X8, X4
MULSD X9, X5
ADDSD X2, SUM // SUM += X_i
ADDSD X3, P_SUM
ADDSD X4, SUM
ADDSD X5, P_SUM
LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[INC_X * 4])
LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[INC_Y * 4])
DECQ LEN
JNZ dot_loop // } while --LEN > 0
ADDSD P_SUM, SUM // SUM += P_SUM
CMPQ TAIL, $0 // if TAIL == 0 { return }
JE dot_end
dot_tail: // do {
CVTSS2SD (X_PTR), X2 // X2 = float64(x[i])
CVTSS2SD (Y_PTR), X3 // X3 = float64(y[i])
MULSD X3, X2 // X2 *= X3
ADDSD X2, SUM // SUM += X2
ADDQ INC_X, X_PTR // X_PTR += INC_X
ADDQ INC_Y, Y_PTR // Y_PTR += INC_Y
DECQ TAIL
JNZ dot_tail // } while --TAIL > 0
dot_end:
MOVSD SUM, sum+88(FP) // return SUM
RET

View File

@@ -0,0 +1,110 @@
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
// HADDPD is emitted as raw bytes; the trailing note gives the instruction
// the encoding stands for.
#define HADDPD_SUM_SUM LONG $0xC07C0F66 // @ HADDPD X0, X0
// Register aliases used by DdotUnitary.
#define X_PTR SI
#define Y_PTR DI
#define LEN CX
#define TAIL BX
#define IDX AX
#define SUM X0
#define P_SUM X1
// func DdotUnitary(x, y []float32) (sum float64)
//
// DdotUnitary returns the sum of float64(x[i]) * float64(y[i]) for i in
// [0, min(len(x), len(y))). Operands are widened to double precision before
// the multiply and the 8-byte result is stored with MOVSD.
// NOTE(review): the original signature comment said "(sum float32)", but the
// result is written as a double; confirm against the Go declaration.
//
// Phases: a scalar loop that runs until &y[i] is 16-byte aligned, an 8-element
// unrolled loop (two doubles per register, accumulators SUM and P_SUM), a
// 2-element loop, and a single-element tail. The two lanes of SUM are added
// together at the end.
TEXT ·DdotUnitary(SB), NOSPLIT, $0
MOVQ x_base+0(FP), X_PTR // X_PTR = &x
MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y
MOVQ x_len+8(FP), LEN // LEN = min( len(x), len(y) )
CMPQ y_len+32(FP), LEN
CMOVQLE y_len+32(FP), LEN
PXOR SUM, SUM // SUM = 0
CMPQ LEN, $0 // if LEN == 0 { return 0 }
JE dot_end
XORQ IDX, IDX
MOVQ Y_PTR, DX
ANDQ $0xF, DX // DX = &y & 15 (offset from a 16-byte boundary)
JZ dot_no_trim // if DX == 0 { goto dot_no_trim }
SUBQ $16, DX // DX = -(bytes until y is 16-byte aligned)
dot_align: // Trim first value(s) in unaligned buffer do {
CVTSS2SD (X_PTR)(IDX*4), X2 // X2 = float64(x[i])
CVTSS2SD (Y_PTR)(IDX*4), X3 // X3 = float64(y[i])
MULSD X3, X2
ADDSD X2, SUM // SUM += X2
INCQ IDX // IDX++
DECQ LEN
JZ dot_end // if --LEN == 0 { return }
ADDQ $4, DX
JNZ dot_align // } while (DX += 4) != 0
dot_no_trim:
PXOR P_SUM, P_SUM // P_SUM = 0 for pipelining
MOVQ LEN, TAIL
ANDQ $0x7, TAIL // TAIL = LEN % 8
SHRQ $3, LEN // LEN = floor( LEN / 8 )
JZ dot_tail_start // if LEN == 0 { goto dot_tail_start }
dot_loop: // Loop unrolled 8x do {
CVTPS2PD (X_PTR)(IDX*4), X2 // X_i = float64(x[i:i+2])
CVTPS2PD 8(X_PTR)(IDX*4), X3
CVTPS2PD 16(X_PTR)(IDX*4), X4
CVTPS2PD 24(X_PTR)(IDX*4), X5
CVTPS2PD (Y_PTR)(IDX*4), X6 // X_j = float64(y[i:i+2])
CVTPS2PD 8(Y_PTR)(IDX*4), X7
CVTPS2PD 16(Y_PTR)(IDX*4), X8
CVTPS2PD 24(Y_PTR)(IDX*4), X9
MULPD X6, X2 // X_i *= X_j
MULPD X7, X3
MULPD X8, X4
MULPD X9, X5
ADDPD X2, SUM // SUM += X_i
ADDPD X3, P_SUM
ADDPD X4, SUM
ADDPD X5, P_SUM
ADDQ $8, IDX // IDX += 8
DECQ LEN
JNZ dot_loop // } while --LEN > 0
ADDPD P_SUM, SUM // SUM += P_SUM
CMPQ TAIL, $0 // if TAIL == 0 { return }
JE dot_end
dot_tail_start:
MOVQ TAIL, LEN
SHRQ $1, LEN // LEN = floor( TAIL / 2 )
JZ dot_tail_one // if LEN == 0 { goto dot_tail_one }
dot_tail_two: // do {
CVTPS2PD (X_PTR)(IDX*4), X2 // X_i = float64(x[i:i+2])
CVTPS2PD (Y_PTR)(IDX*4), X6 // X_j = float64(y[i:i+2])
MULPD X6, X2 // X_i *= X_j
ADDPD X2, SUM // SUM += X_i
ADDQ $2, IDX // IDX += 2
DECQ LEN
JNZ dot_tail_two // } while --LEN > 0
ANDQ $1, TAIL
JZ dot_end // if TAIL % 2 == 0 { return }
dot_tail_one:
CVTSS2SD (X_PTR)(IDX*4), X2 // X2 = float64(x[i])
CVTSS2SD (Y_PTR)(IDX*4), X3 // X3 = float64(y[i])
MULSD X3, X2 // X2 *= X3
ADDSD X2, SUM // SUM += X2
dot_end:
HADDPD_SUM_SUM // SUM = \sum{ SUM[i] }
MOVSD SUM, sum+48(FP) // return SUM
RET

6
vendor/gonum.org/v1/gonum/internal/asm/f32/doc.go generated vendored Normal file
View File

@@ -0,0 +1,6 @@
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package f32 provides float32 vector primitives.
package f32 // import "gonum.org/v1/gonum/internal/asm/f32"

View File

@@ -0,0 +1,85 @@
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
// Register aliases used by DotInc.
#define X_PTR SI
#define Y_PTR DI
#define LEN CX
#define TAIL BX
#define INC_X R8
#define INCx3_X R10
#define INC_Y R9
#define INCx3_Y R11
#define SUM X0
#define P_SUM X1
// func DotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float32)
//
// DotInc returns the single-precision sum of x[ix + i*incX] * y[iy + i*incY]
// for i in [0, n). The main loop is unrolled 4x with two accumulators (SUM
// and P_SUM); the remaining n % 4 elements are handled one at a time.
TEXT ·DotInc(SB), NOSPLIT, $0
MOVQ x_base+0(FP), X_PTR // X_PTR = &x
MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y
PXOR SUM, SUM // SUM = 0
MOVQ n+48(FP), LEN // LEN = n
CMPQ LEN, $0 // if LEN == 0 { return 0 }
JE dot_end
MOVQ ix+72(FP), INC_X // INC_X = ix
MOVQ iy+80(FP), INC_Y // INC_Y = iy
LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(x[ix])
LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(y[iy])
MOVQ incX+56(FP), INC_X // INC_X := incX * sizeof(float32)
SHLQ $2, INC_X
MOVQ incY+64(FP), INC_Y // INC_Y := incY * sizeof(float32)
SHLQ $2, INC_Y
MOVQ LEN, TAIL
ANDQ $0x3, TAIL // TAIL = LEN % 4
SHRQ $2, LEN // LEN = floor( LEN / 4 )
JZ dot_tail // if LEN == 0 { goto dot_tail }
PXOR P_SUM, P_SUM // P_SUM = 0 for pipelining
LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3
LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = INC_Y * 3
dot_loop: // Loop unrolled 4x do {
MOVSS (X_PTR), X2 // X_i = x[i]
MOVSS (X_PTR)(INC_X*1), X3
MOVSS (X_PTR)(INC_X*2), X4
MOVSS (X_PTR)(INCx3_X*1), X5
MULSS (Y_PTR), X2 // X_i *= y[i]
MULSS (Y_PTR)(INC_Y*1), X3
MULSS (Y_PTR)(INC_Y*2), X4
MULSS (Y_PTR)(INCx3_Y*1), X5
ADDSS X2, SUM // SUM += X_i
ADDSS X3, P_SUM
ADDSS X4, SUM
ADDSS X5, P_SUM
LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[INC_X * 4])
LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[INC_Y * 4])
DECQ LEN
JNZ dot_loop // } while --LEN > 0
ADDSS P_SUM, SUM // SUM += P_SUM
CMPQ TAIL, $0 // if TAIL == 0 { return }
JE dot_end
dot_tail: // do {
MOVSS (X_PTR), X2 // X2 = x[i]
MULSS (Y_PTR), X2 // X2 *= y[i]
ADDSS X2, SUM // SUM += X2
ADDQ INC_X, X_PTR // X_PTR += INC_X
ADDQ INC_Y, Y_PTR // Y_PTR += INC_Y
DECQ TAIL
JNZ dot_tail // } while --TAIL > 0
dot_end:
MOVSS SUM, sum+88(FP) // return SUM
RET

View File

@@ -0,0 +1,106 @@
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
// HADDPS is emitted as raw bytes; the trailing note gives the instruction
// the encoding stands for.
#define HADDPS_SUM_SUM LONG $0xC07C0FF2 // @ HADDPS X0, X0
// Register aliases used by DotUnitary.
#define X_PTR SI
#define Y_PTR DI
#define LEN CX
#define TAIL BX
#define IDX AX
#define SUM X0
#define P_SUM X1
// func DotUnitary(x, y []float32) (sum float32)
//
// DotUnitary returns the single-precision sum of x[i] * y[i] for i in
// [0, min(len(x), len(y))).
//
// Phases: a scalar loop that runs until &y[i] is 16-byte aligned (MULPS with
// a memory operand reads y), a 16-element unrolled SSE loop with two
// accumulators (SUM and P_SUM), a 4-element SSE loop, and a scalar tail. The
// four lanes of SUM are added together with two horizontal adds at the end.
TEXT ·DotUnitary(SB), NOSPLIT, $0
MOVQ x_base+0(FP), X_PTR // X_PTR = &x
MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y
PXOR SUM, SUM // SUM = 0
MOVQ x_len+8(FP), LEN // LEN = min( len(x), len(y) )
CMPQ y_len+32(FP), LEN
CMOVQLE y_len+32(FP), LEN
CMPQ LEN, $0 // if LEN == 0 { return 0 }
JE dot_end
XORQ IDX, IDX
MOVQ Y_PTR, DX
ANDQ $0xF, DX // Align on 16-byte boundary for MULPS
JZ dot_no_trim // if DX == 0 { goto dot_no_trim }
SUBQ $16, DX // DX = -(bytes until y is 16-byte aligned)
dot_align: // Trim first value(s) in unaligned buffer do {
MOVSS (X_PTR)(IDX*4), X2 // X2 = x[i]
MULSS (Y_PTR)(IDX*4), X2 // X2 *= y[i]
ADDSS X2, SUM // SUM += X2
INCQ IDX // IDX++
DECQ LEN
JZ dot_end // if --LEN == 0 { return }
ADDQ $4, DX
JNZ dot_align // } while (DX += 4) != 0
dot_no_trim:
PXOR P_SUM, P_SUM // P_SUM = 0 for pipelining
MOVQ LEN, TAIL
ANDQ $0xF, TAIL // TAIL = LEN % 16
SHRQ $4, LEN // LEN = floor( LEN / 16 )
JZ dot_tail4_start // if LEN == 0 { goto dot_tail4_start }
dot_loop: // Loop unrolled 16x do {
MOVUPS (X_PTR)(IDX*4), X2 // X_i = x[i:i+4]
MOVUPS 16(X_PTR)(IDX*4), X3
MOVUPS 32(X_PTR)(IDX*4), X4
MOVUPS 48(X_PTR)(IDX*4), X5
MULPS (Y_PTR)(IDX*4), X2 // X_i *= y[i:i+4]
MULPS 16(Y_PTR)(IDX*4), X3
MULPS 32(Y_PTR)(IDX*4), X4
MULPS 48(Y_PTR)(IDX*4), X5
ADDPS X2, SUM // SUM += X_i
ADDPS X3, P_SUM
ADDPS X4, SUM
ADDPS X5, P_SUM
ADDQ $16, IDX // IDX += 16
DECQ LEN
JNZ dot_loop // } while --LEN > 0
ADDPS P_SUM, SUM // SUM += P_SUM
CMPQ TAIL, $0 // if TAIL == 0 { return }
JE dot_end
dot_tail4_start: // Reset loop counter for 4-wide tail loop
MOVQ TAIL, LEN // LEN = floor( TAIL / 4 )
SHRQ $2, LEN
JZ dot_tail_start // if LEN == 0 { goto dot_tail_start }
dot_tail4_loop: // Loop unrolled 4x do {
MOVUPS (X_PTR)(IDX*4), X2 // X_i = x[i:i+4]
MULPS (Y_PTR)(IDX*4), X2 // X_i *= y[i:i+4]
ADDPS X2, SUM // SUM += X_i
ADDQ $4, IDX // i += 4
DECQ LEN
JNZ dot_tail4_loop // } while --LEN > 0
dot_tail_start: // Reset loop counter for 1-wide tail loop
ANDQ $3, TAIL // TAIL = TAIL % 4
JZ dot_end // if TAIL == 0 { return }
dot_tail: // do {
MOVSS (X_PTR)(IDX*4), X2 // X2 = x[i]
MULSS (Y_PTR)(IDX*4), X2 // X2 *= y[i]
ADDSS X2, SUM // SUM += X2
INCQ IDX // IDX++
DECQ TAIL
JNZ dot_tail // } while --TAIL > 0
dot_end:
HADDPS_SUM_SUM // SUM = \sum{ SUM[i] }
HADDPS_SUM_SUM
MOVSS SUM, sum+48(FP) // return SUM
RET

18
vendor/gonum.org/v1/gonum/internal/asm/f32/ge_amd64.go generated vendored Normal file
View File

@@ -0,0 +1,18 @@
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !noasm && !gccgo && !safe
// +build !noasm,!gccgo,!safe
package f32
// Ger performs the rank-one operation
//
//	A += alpha * x * yᵀ
//
// where A is an m×n dense matrix, x and y are vectors, and alpha is a scalar.
//
// incX and incY are the element strides of x and y, and lda is the stride
// between rows of a (presumably row-major storage; confirm against the
// implementation). The declaration has no Go body, so the implementation is
// supplied outside Go source (presumably the ge_amd64.s file in this package).
func Ger(m, n uintptr, alpha float32,
x []float32, incX uintptr,
y []float32, incY uintptr,
a []float32, lda uintptr)

757
vendor/gonum.org/v1/gonum/internal/asm/f32/ge_amd64.s generated vendored Normal file
View File

@@ -0,0 +1,757 @@
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
// Element size in bytes (float32), its log2, and the log2 of the widest
// column kernel (8 columns per iteration).
#define SIZE 4
#define BITSIZE 2
#define KERNELSIZE 3
// Row count: argument slot and working register.
#define M_DIM m+0(FP)
#define M CX
// Column count: argument slot and working register.
#define N_DIM n+8(FP)
#define N BX
// Scratch registers used while computing start offsets for negative increments.
#define TMP1 R14
#define TMP2 R15
// Cursor into x.
#define X_PTR SI
// Y is the y_base argument slot; Y_PTR is the cursor into y, reloaded from Y
// at the end of every row group.
#define Y y_base+56(FP)
#define Y_PTR DX
// A_ROW marks the start of the current row group of A; A_PTR walks across it.
#define A_ROW AX
#define A_PTR DI
// Byte strides of x and y and three times those strides (strided path only).
#define INC_X R8
#define INC3_X R9
#define INC_Y R10
#define INC3_Y R11
// Row stride of A in bytes and three times that stride.
#define LDA R12
#define LDA3 R13
// ALPHA holds alpha in all four lanes. ALPHA_SPILL is the 16-byte frame slot
// where STORE_4x8 parks it while X0 is borrowed as a temporary.
#define ALPHA X0
#define ALPHA_SPILL al-16(SP)
// LOAD_ALPHA reads the scalar alpha argument and broadcasts it to every lane.
#define LOAD_ALPHA \
MOVSS alpha+16(FP), ALPHA \
SHUFPS $0, ALPHA, ALPHA
// LOAD_SCALED4 reads four consecutive x values, broadcasts each one across a
// register (X1..X4 hold x[0]..x[3] respectively) and multiplies by ALPHA.
// MOVDDUP loads a pair, MOVSHDUP/MOVSLDUP then replicate the odd/even lane.
// It also issues a non-temporal prefetch 16 elements ahead of X_PTR.
#define LOAD_SCALED4 \
PREFETCHNTA 16*SIZE(X_PTR) \
MOVDDUP (X_PTR), X1 \
MOVDDUP 2*SIZE(X_PTR), X3 \
MOVSHDUP X1, X2 \
MOVSHDUP X3, X4 \
MOVSLDUP X1, X1 \
MOVSLDUP X3, X3 \
MULPS ALPHA, X1 \
MULPS ALPHA, X2 \
MULPS ALPHA, X3 \
MULPS ALPHA, X4
// LOAD_SCALED2 is the two-row form: X1 = alpha*x[0], X2 = alpha*x[1], each
// broadcast to all lanes.
#define LOAD_SCALED2 \
MOVDDUP (X_PTR), X1 \
MOVSHDUP X1, X2 \
MOVSLDUP X1, X1 \
MULPS ALPHA, X1 \
MULPS ALPHA, X2
// LOAD_SCALED1 is the single-row form: X1 = alpha*x[0] in all lanes.
#define LOAD_SCALED1 \
MOVSS (X_PTR), X1 \
SHUFPS $0, X1, X1 \
MULPS ALPHA, X1
// LOAD_SCALED4_INC is LOAD_SCALED4 for strided x: the four values are read
// INC_X bytes apart (INC3_X supplies the 3*INC_X offset) and broadcast with
// SHUFPS before scaling by ALPHA.
#define LOAD_SCALED4_INC \
PREFETCHNTA (X_PTR)(INC_X*8) \
MOVSS (X_PTR), X1 \
MOVSS (X_PTR)(INC_X*1), X2 \
MOVSS (X_PTR)(INC_X*2), X3 \
MOVSS (X_PTR)(INC3_X*1), X4 \
SHUFPS $0, X1, X1 \
SHUFPS $0, X2, X2 \
SHUFPS $0, X3, X3 \
SHUFPS $0, X4, X4 \
MULPS ALPHA, X1 \
MULPS ALPHA, X2 \
MULPS ALPHA, X3 \
MULPS ALPHA, X4
// LOAD_SCALED2_INC is the two-row strided form of the above.
#define LOAD_SCALED2_INC \
MOVSS (X_PTR), X1 \
MOVSS (X_PTR)(INC_X*1), X2 \
SHUFPS $0, X1, X1 \
SHUFPS $0, X2, X2 \
MULPS ALPHA, X1 \
MULPS ALPHA, X2
// KERNEL_LOAD8 reads eight consecutive y values into X5 (first four) and X6
// (next four). Y_PTR is not advanced.
#define KERNEL_LOAD8 \
MOVUPS (Y_PTR), X5 \
MOVUPS 4*SIZE(Y_PTR), X6
// KERNEL_LOAD8_INC gathers eight y values spaced INC_Y bytes apart into X5
// and X6. Scalars are paired with UNPCKLPS and the pairs joined with MOVLHPS.
// Side effects: Y_PTR is advanced by 4*INC_Y midway through, and X7, X8 and
// X9 are clobbered.
#define KERNEL_LOAD8_INC \
MOVSS (Y_PTR), X5 \
MOVSS (Y_PTR)(INC_Y*1), X6 \
MOVSS (Y_PTR)(INC_Y*2), X7 \
MOVSS (Y_PTR)(INC3_Y*1), X8 \
UNPCKLPS X6, X5 \
UNPCKLPS X8, X7 \
MOVLHPS X7, X5 \
LEAQ (Y_PTR)(INC_Y*4), Y_PTR \
MOVSS (Y_PTR), X6 \
MOVSS (Y_PTR)(INC_Y*1), X7 \
MOVSS (Y_PTR)(INC_Y*2), X8 \
MOVSS (Y_PTR)(INC3_Y*1), X9 \
UNPCKLPS X7, X6 \
UNPCKLPS X9, X8 \
MOVLHPS X8, X6
// KERNEL_LOAD4 reads four consecutive y values into X5.
#define KERNEL_LOAD4 \
MOVUPS (Y_PTR), X5
// KERNEL_LOAD4_INC gathers four strided y values into X5; clobbers X6, X7, X8.
// Y_PTR is not advanced.
#define KERNEL_LOAD4_INC \
MOVSS (Y_PTR), X5 \
MOVSS (Y_PTR)(INC_Y*1), X6 \
MOVSS (Y_PTR)(INC_Y*2), X7 \
MOVSS (Y_PTR)(INC3_Y*1), X8 \
UNPCKLPS X6, X5 \
UNPCKLPS X8, X7 \
MOVLHPS X7, X5
// KERNEL_LOAD2 reads two consecutive y values into the low half of X5.
#define KERNEL_LOAD2 \
MOVSD (Y_PTR), X5
// KERNEL_LOAD2_INC gathers two strided y values into the low half of X5;
// clobbers X6.
#define KERNEL_LOAD2_INC \
MOVSS (Y_PTR), X5 \
MOVSS (Y_PTR)(INC_Y*1), X6 \
UNPCKLPS X6, X5
// KERNEL_4x8 forms the 4x8 block of products. On entry X5/X6 hold eight y
// values and X1..X4 the four scaled x broadcasts. On exit the rows are in
// X5/X6, X7/X8, X9/X10 and X11/X12.
#define KERNEL_4x8 \
MOVUPS X5, X7 \
MOVUPS X6, X8 \
MOVUPS X5, X9 \
MOVUPS X6, X10 \
MOVUPS X5, X11 \
MOVUPS X6, X12 \
MULPS X1, X5 \
MULPS X1, X6 \
MULPS X2, X7 \
MULPS X2, X8 \
MULPS X3, X9 \
MULPS X3, X10 \
MULPS X4, X11 \
MULPS X4, X12
// STORE_4x8 adds the current contents of the 4x8 block of A to the products,
// writes the sums back and advances A_PTR by eight elements. X13..X15 and X0
// serve as load temporaries; since X0 is ALPHA it is saved to ALPHA_SPILL
// first and restored afterwards.
#define STORE_4x8 \
MOVUPS ALPHA, ALPHA_SPILL \
MOVUPS (A_PTR), X13 \
ADDPS X13, X5 \
MOVUPS 4*SIZE(A_PTR), X14 \
ADDPS X14, X6 \
MOVUPS (A_PTR)(LDA*1), X15 \
ADDPS X15, X7 \
MOVUPS 4*SIZE(A_PTR)(LDA*1), X0 \
ADDPS X0, X8 \
MOVUPS (A_PTR)(LDA*2), X13 \
ADDPS X13, X9 \
MOVUPS 4*SIZE(A_PTR)(LDA*2), X14 \
ADDPS X14, X10 \
MOVUPS (A_PTR)(LDA3*1), X15 \
ADDPS X15, X11 \
MOVUPS 4*SIZE(A_PTR)(LDA3*1), X0 \
ADDPS X0, X12 \
MOVUPS X5, (A_PTR) \
MOVUPS X6, 4*SIZE(A_PTR) \
MOVUPS X7, (A_PTR)(LDA*1) \
MOVUPS X8, 4*SIZE(A_PTR)(LDA*1) \
MOVUPS X9, (A_PTR)(LDA*2) \
MOVUPS X10, 4*SIZE(A_PTR)(LDA*2) \
MOVUPS X11, (A_PTR)(LDA3*1) \
MOVUPS X12, 4*SIZE(A_PTR)(LDA3*1) \
MOVUPS ALPHA_SPILL, ALPHA \
ADDQ $8*SIZE, A_PTR
// KERNEL_4x4 forms the 4x4 block of products from the four y values in X5;
// the rows end up in X5, X6, X7 and X8.
#define KERNEL_4x4 \
MOVUPS X5, X6 \
MOVUPS X5, X7 \
MOVUPS X5, X8 \
MULPS X1, X5 \
MULPS X2, X6 \
MULPS X3, X7 \
MULPS X4, X8
// STORE_4x4 accumulates X5..X8 into four rows of A (four columns each) and
// advances A_PTR by four elements. Clobbers X13..X15.
#define STORE_4x4 \
MOVUPS (A_PTR), X13 \
ADDPS X13, X5 \
MOVUPS (A_PTR)(LDA*1), X14 \
ADDPS X14, X6 \
MOVUPS (A_PTR)(LDA*2), X15 \
ADDPS X15, X7 \
MOVUPS (A_PTR)(LDA3*1), X13 \
ADDPS X13, X8 \
MOVUPS X5, (A_PTR) \
MOVUPS X6, (A_PTR)(LDA*1) \
MOVUPS X7, (A_PTR)(LDA*2) \
MOVUPS X8, (A_PTR)(LDA3*1) \
ADDQ $4*SIZE, A_PTR
// KERNEL_4x2 has the same instruction sequence as KERNEL_4x4; only the low
// two lanes of each result are written back by STORE_4x2.
#define KERNEL_4x2 \
MOVUPS X5, X6 \
MOVUPS X5, X7 \
MOVUPS X5, X8 \
MULPS X1, X5 \
MULPS X2, X6 \
MULPS X3, X7 \
MULPS X4, X8
// STORE_4x2 accumulates two columns into each of four rows of A using 64-bit
// MOVSD loads/stores and advances A_PTR by two elements. Clobbers X9..X12.
#define STORE_4x2 \
MOVSD (A_PTR), X9 \
ADDPS X9, X5 \
MOVSD (A_PTR)(LDA*1), X10 \
ADDPS X10, X6 \
MOVSD (A_PTR)(LDA*2), X11 \
ADDPS X11, X7 \
MOVSD (A_PTR)(LDA3*1), X12 \
ADDPS X12, X8 \
MOVSD X5, (A_PTR) \
MOVSD X6, (A_PTR)(LDA*1) \
MOVSD X7, (A_PTR)(LDA*2) \
MOVSD X8, (A_PTR)(LDA3*1) \
ADDQ $2*SIZE, A_PTR
// KERNEL_4x1 loads a single y value itself (no separate KERNEL_LOAD step)
// and forms the four scalar products in the low lanes of X5..X8.
#define KERNEL_4x1 \
MOVSS (Y_PTR), X5 \
MOVSS X5, X6 \
MOVSS X5, X7 \
MOVSS X5, X8 \
MULSS X1, X5 \
MULSS X2, X6 \
MULSS X3, X7 \
MULSS X4, X8
// STORE_4x1 accumulates one column into each of four rows of A and advances
// A_PTR by one element.
#define STORE_4x1 \
ADDSS (A_PTR), X5 \
ADDSS (A_PTR)(LDA*1), X6 \
ADDSS (A_PTR)(LDA*2), X7 \
ADDSS (A_PTR)(LDA3*1), X8 \
MOVSS X5, (A_PTR) \
MOVSS X6, (A_PTR)(LDA*1) \
MOVSS X7, (A_PTR)(LDA*2) \
MOVSS X8, (A_PTR)(LDA3*1) \
ADDQ $SIZE, A_PTR
// KERNEL_2x8 forms the 2x8 block of products from the eight y values in
// X5/X6 and the two scaled x broadcasts in X1/X2. Row 0 ends up in X5/X6 and
// row 1 in X7/X8.
#define KERNEL_2x8 \
MOVUPS X5, X7 \
MOVUPS X6, X8 \
MULPS X1, X5 \
MULPS X1, X6 \
MULPS X2, X7 \
MULPS X2, X8
// STORE_2x8 accumulates the 2x8 block into A and advances A_PTR by eight
// elements. Clobbers X9..X12.
#define STORE_2x8 \
MOVUPS (A_PTR), X9 \
ADDPS X9, X5 \
MOVUPS 4*SIZE(A_PTR), X10 \
ADDPS X10, X6 \
MOVUPS (A_PTR)(LDA*1), X11 \
ADDPS X11, X7 \
MOVUPS 4*SIZE(A_PTR)(LDA*1), X12 \
ADDPS X12, X8 \
MOVUPS X5, (A_PTR) \
MOVUPS X6, 4*SIZE(A_PTR) \
MOVUPS X7, (A_PTR)(LDA*1) \
MOVUPS X8, 4*SIZE(A_PTR)(LDA*1) \
ADDQ $8*SIZE, A_PTR
// KERNEL_2x4 forms the 2x4 block: row 0 in X5, row 1 in X6.
#define KERNEL_2x4 \
MOVUPS X5, X6 \
MULPS X1, X5 \
MULPS X2, X6
// STORE_2x4 accumulates the 2x4 block into A and advances A_PTR by four
// elements. Clobbers X9 and X11.
#define STORE_2x4 \
MOVUPS (A_PTR), X9 \
ADDPS X9, X5 \
MOVUPS (A_PTR)(LDA*1), X11 \
ADDPS X11, X6 \
MOVUPS X5, (A_PTR) \
MOVUPS X6, (A_PTR)(LDA*1) \
ADDQ $4*SIZE, A_PTR
// KERNEL_2x2 forms the 2x2 block; the register-to-register MOVSD copies only
// the low two lanes of X5 into X6, which are the lanes STORE_2x2 writes back.
#define KERNEL_2x2 \
MOVSD X5, X6 \
MULPS X1, X5 \
MULPS X2, X6
// STORE_2x2 accumulates two columns into each of two rows of A and advances
// A_PTR by two elements. Clobbers X7 and X8.
#define STORE_2x2 \
MOVSD (A_PTR), X7 \
ADDPS X7, X5 \
MOVSD (A_PTR)(LDA*1), X8 \
ADDPS X8, X6 \
MOVSD X5, (A_PTR) \
MOVSD X6, (A_PTR)(LDA*1) \
ADDQ $2*SIZE, A_PTR
// KERNEL_2x1 loads a single y value itself and forms the two scalar products
// in the low lanes of X5 and X6.
#define KERNEL_2x1 \
MOVSS (Y_PTR), X5 \
MOVSS X5, X6 \
MULSS X1, X5 \
MULSS X2, X6
// STORE_2x1 accumulates one column into each of two rows of A and advances
// A_PTR by one element.
#define STORE_2x1 \
ADDSS (A_PTR), X5 \
ADDSS (A_PTR)(LDA*1), X6 \
MOVSS X5, (A_PTR) \
MOVSS X6, (A_PTR)(LDA*1) \
ADDQ $SIZE, A_PTR
// KERNEL_1x8 scales the eight y values in X5/X6 by the broadcast in X1.
#define KERNEL_1x8 \
MULPS X1, X5 \
MULPS X1, X6
// STORE_1x8 accumulates the eight products into one row of A and advances
// A_PTR by eight elements. Clobbers X7 and X8.
#define STORE_1x8 \
MOVUPS (A_PTR), X7 \
ADDPS X7, X5 \
MOVUPS 4*SIZE(A_PTR), X8 \
ADDPS X8, X6 \
MOVUPS X5, (A_PTR) \
MOVUPS X6, 4*SIZE(A_PTR) \
ADDQ $8*SIZE, A_PTR
// KERNEL_1x4 scales the four y values in X5 by X1.
// NOTE(review): the second MULPS also scales X6, but STORE_1x4 only writes
// back X5, so that result appears to be unused — confirm before removing.
#define KERNEL_1x4 \
MULPS X1, X5 \
MULPS X1, X6
// STORE_1x4 accumulates four products into one row of A and advances A_PTR
// by four elements. Clobbers X7.
#define STORE_1x4 \
MOVUPS (A_PTR), X7 \
ADDPS X7, X5 \
MOVUPS X5, (A_PTR) \
ADDQ $4*SIZE, A_PTR
// KERNEL_1x2 scales X5 by X1; only the low two lanes are stored by STORE_1x2.
#define KERNEL_1x2 \
MULPS X1, X5
// STORE_1x2 accumulates two products into one row of A and advances A_PTR by
// two elements. Clobbers X6.
#define STORE_1x2 \
MOVSD (A_PTR), X6 \
ADDPS X6, X5 \
MOVSD X5, (A_PTR) \
ADDQ $2*SIZE, A_PTR
// KERNEL_1x1 loads a single y value itself and multiplies it by the low lane
// of X1.
#define KERNEL_1x1 \
MOVSS (Y_PTR), X5 \
MULSS X1, X5
// STORE_1x1 accumulates one product into A and advances A_PTR by one element.
#define STORE_1x1 \
ADDSS (A_PTR), X5 \
MOVSS X5, (A_PTR) \
ADDQ $SIZE, A_PTR
// func Ger(m, n uintptr, alpha float32,
// x []float32, incX uintptr,
// y []float32, incY uintptr,
// a []float32, lda uintptr)
//
// Accumulates alpha * x[i] * y[j] into element (i, j) of A for every row i
// and column j.
//
// Rows are consumed in groups of 4, then 2, then 1, selected by the bits of
// m. Inside each row group the columns are consumed 8 at a time, followed by
// optional 4-, 2- and 1-wide steps selected by the bits of n. The labels
// r4/r2/r1 handle the case incX == 1 && incY == 1; the inc_* labels handle
// every other increment combination with strided loads.
//
// The 16-byte frame is the ALPHA_SPILL slot used by STORE_4x8.
TEXT ·Ger(SB), 0, $16-120
MOVQ M_DIM, M
MOVQ N_DIM, N
// Nothing to do for an empty matrix.
CMPQ M, $0
JE end
CMPQ N, $0
JE end
LOAD_ALPHA
MOVQ x_base+24(FP), X_PTR
MOVQ y_base+56(FP), Y_PTR
MOVQ a_base+88(FP), A_ROW
MOVQ A_ROW, A_PTR
MOVQ lda+112(FP), LDA // LDA = LDA * sizeof(float32)
SHLQ $BITSIZE, LDA
LEAQ (LDA)(LDA*2), LDA3 // LDA3 = LDA * 3
CMPQ incY+80(FP), $1 // Check for dense vector Y (fast-path)
JNE inc
CMPQ incX+48(FP), $1 // Check for dense vector X (fast-path)
JNE inc
// M = number of 4-row groups; skip straight to the 2-row step if there are none.
SHRQ $2, M
JZ r2
r4:
// LOAD 4
LOAD_SCALED4
// N = number of 8-column iterations for this row group.
MOVQ N_DIM, N
SHRQ $KERNELSIZE, N
JZ r4c4
r4c8:
// 4x8 KERNEL
KERNEL_LOAD8
KERNEL_4x8
STORE_4x8
ADDQ $8*SIZE, Y_PTR
DECQ N
JNZ r4c8
r4c4:
// Remaining columns are selected by testing bits 2, 1 and 0 of n.
TESTQ $4, N_DIM
JZ r4c2
// 4x4 KERNEL
KERNEL_LOAD4
KERNEL_4x4
STORE_4x4
ADDQ $4*SIZE, Y_PTR
r4c2:
TESTQ $2, N_DIM
JZ r4c1
// 4x2 KERNEL
KERNEL_LOAD2
KERNEL_4x2
STORE_4x2
ADDQ $2*SIZE, Y_PTR
r4c1:
TESTQ $1, N_DIM
JZ r4end
// 4x1 KERNEL
KERNEL_4x1
STORE_4x1
ADDQ $SIZE, Y_PTR
r4end:
// Advance x by four elements, rewind y to its start and step A down four rows.
ADDQ $4*SIZE, X_PTR
MOVQ Y, Y_PTR
LEAQ (A_ROW)(LDA*4), A_ROW
MOVQ A_ROW, A_PTR
DECQ M
JNZ r4
r2:
// Optional 2-row group (bit 1 of m).
TESTQ $2, M_DIM
JZ r1
// LOAD 2
LOAD_SCALED2
MOVQ N_DIM, N
SHRQ $KERNELSIZE, N
JZ r2c4
r2c8:
// 2x8 KERNEL
KERNEL_LOAD8
KERNEL_2x8
STORE_2x8
ADDQ $8*SIZE, Y_PTR
DECQ N
JNZ r2c8
r2c4:
TESTQ $4, N_DIM
JZ r2c2
// 2x4 KERNEL
KERNEL_LOAD4
KERNEL_2x4
STORE_2x4
ADDQ $4*SIZE, Y_PTR
r2c2:
TESTQ $2, N_DIM
JZ r2c1
// 2x2 KERNEL
KERNEL_LOAD2
KERNEL_2x2
STORE_2x2
ADDQ $2*SIZE, Y_PTR
r2c1:
TESTQ $1, N_DIM
JZ r2end
// 2x1 KERNEL
KERNEL_2x1
STORE_2x1
ADDQ $SIZE, Y_PTR
r2end:
// Advance x by two elements, rewind y and step A down two rows.
ADDQ $2*SIZE, X_PTR
MOVQ Y, Y_PTR
LEAQ (A_ROW)(LDA*2), A_ROW
MOVQ A_ROW, A_PTR
r1:
// Optional final single row (bit 0 of m).
TESTQ $1, M_DIM
JZ end
// LOAD 1
LOAD_SCALED1
MOVQ N_DIM, N
SHRQ $KERNELSIZE, N
JZ r1c4
r1c8:
// 1x8 KERNEL
KERNEL_LOAD8
KERNEL_1x8
STORE_1x8
ADDQ $8*SIZE, Y_PTR
DECQ N
JNZ r1c8
r1c4:
TESTQ $4, N_DIM
JZ r1c2
// 1x4 KERNEL
KERNEL_LOAD4
KERNEL_1x4
STORE_1x4
ADDQ $4*SIZE, Y_PTR
r1c2:
TESTQ $2, N_DIM
JZ r1c1
// 1x2 KERNEL
KERNEL_LOAD2
KERNEL_1x2
STORE_1x2
ADDQ $2*SIZE, Y_PTR
r1c1:
TESTQ $1, N_DIM
JZ end
// 1x1 KERNEL
KERNEL_1x1
STORE_1x1
end:
RET
inc: // Algorithm for incX != 1 or incY != 1 ( split loads in kernel )
MOVQ incX+48(FP), INC_X // INC_X = incX * sizeof(float32)
SHLQ $BITSIZE, INC_X
MOVQ incY+80(FP), INC_Y // INC_Y = incY * sizeof(float32)
SHLQ $BITSIZE, INC_Y
LEAQ (INC_X)(INC_X*2), INC3_X // INC3_X = INC_X * 3
LEAQ (INC_Y)(INC_Y*2), INC3_Y // INC3_Y = INC_Y * 3
// Start offset for x: TMP2 = -(M-1)*INC_X when INC_X is negative, else 0.
// NOTE(review): INC_X is already a byte stride at this point, yet the LEAQ
// below scales TMP2 by SIZE again — confirm the offset for incX < 0.
XORQ TMP2, TMP2
MOVQ M, TMP1
SUBQ $1, TMP1
IMULQ INC_X, TMP1
NEGQ TMP1
CMPQ INC_X, $0
CMOVQLT TMP1, TMP2
LEAQ (X_PTR)(TMP2*SIZE), X_PTR
// Start offset for y, computed the same way from N and INC_Y.
// NOTE(review): same double scaling as for x — confirm for incY < 0.
XORQ TMP2, TMP2
MOVQ N, TMP1
SUBQ $1, TMP1
IMULQ INC_Y, TMP1
NEGQ TMP1
CMPQ INC_Y, $0
CMOVQLT TMP1, TMP2
LEAQ (Y_PTR)(TMP2*SIZE), Y_PTR
SHRQ $2, M
JZ inc_r2
inc_r4:
// LOAD 4
LOAD_SCALED4_INC
MOVQ N_DIM, N
SHRQ $KERNELSIZE, N
JZ inc_r4c4
inc_r4c8:
// 4x8 KERNEL
// KERNEL_LOAD8_INC already advances Y_PTR by four strides; the LEAQ after
// the store supplies the other four.
KERNEL_LOAD8_INC
KERNEL_4x8
STORE_4x8
LEAQ (Y_PTR)(INC_Y*4), Y_PTR
DECQ N
JNZ inc_r4c8
inc_r4c4:
TESTQ $4, N_DIM
JZ inc_r4c2
// 4x4 KERNEL
KERNEL_LOAD4_INC
KERNEL_4x4
STORE_4x4
LEAQ (Y_PTR)(INC_Y*4), Y_PTR
inc_r4c2:
TESTQ $2, N_DIM
JZ inc_r4c1
// 4x2 KERNEL
KERNEL_LOAD2_INC
KERNEL_4x2
STORE_4x2
LEAQ (Y_PTR)(INC_Y*2), Y_PTR
inc_r4c1:
TESTQ $1, N_DIM
JZ inc_r4end
// 4x1 KERNEL
KERNEL_4x1
STORE_4x1
ADDQ INC_Y, Y_PTR
inc_r4end:
// Advance x by four strides, rewind y and step A down four rows.
// NOTE(review): Y is the unmodified y_base slot, so this reload drops any
// start offset applied to Y_PTR above — confirm for incY < 0.
LEAQ (X_PTR)(INC_X*4), X_PTR
MOVQ Y, Y_PTR
LEAQ (A_ROW)(LDA*4), A_ROW
MOVQ A_ROW, A_PTR
DECQ M
JNZ inc_r4
inc_r2:
TESTQ $2, M_DIM
JZ inc_r1
// LOAD 2
LOAD_SCALED2_INC
MOVQ N_DIM, N
SHRQ $KERNELSIZE, N
JZ inc_r2c4
inc_r2c8:
// 2x8 KERNEL
KERNEL_LOAD8_INC
KERNEL_2x8
STORE_2x8
LEAQ (Y_PTR)(INC_Y*4), Y_PTR
DECQ N
JNZ inc_r2c8
inc_r2c4:
TESTQ $4, N_DIM
JZ inc_r2c2
// 2x4 KERNEL
KERNEL_LOAD4_INC
KERNEL_2x4
STORE_2x4
LEAQ (Y_PTR)(INC_Y*4), Y_PTR
inc_r2c2:
TESTQ $2, N_DIM
JZ inc_r2c1
// 2x2 KERNEL
KERNEL_LOAD2_INC
KERNEL_2x2
STORE_2x2
LEAQ (Y_PTR)(INC_Y*2), Y_PTR
inc_r2c1:
TESTQ $1, N_DIM
JZ inc_r2end
// 2x1 KERNEL
KERNEL_2x1
STORE_2x1
ADDQ INC_Y, Y_PTR
inc_r2end:
// Advance x by two strides, rewind y (see the note at inc_r4end) and step A
// down two rows.
LEAQ (X_PTR)(INC_X*2), X_PTR
MOVQ Y, Y_PTR
LEAQ (A_ROW)(LDA*2), A_ROW
MOVQ A_ROW, A_PTR
inc_r1:
TESTQ $1, M_DIM
JZ end
// LOAD 1
// A single x value needs no stride, so the unit-stride loader is reused.
LOAD_SCALED1
MOVQ N_DIM, N
SHRQ $KERNELSIZE, N
JZ inc_r1c4
inc_r1c8:
// 1x8 KERNEL
KERNEL_LOAD8_INC
KERNEL_1x8
STORE_1x8
LEAQ (Y_PTR)(INC_Y*4), Y_PTR
DECQ N
JNZ inc_r1c8
inc_r1c4:
TESTQ $4, N_DIM
JZ inc_r1c2
// 1x4 KERNEL
KERNEL_LOAD4_INC
KERNEL_1x4
STORE_1x4
LEAQ (Y_PTR)(INC_Y*4), Y_PTR
inc_r1c2:
TESTQ $2, N_DIM
JZ inc_r1c1
// 1x2 KERNEL
KERNEL_LOAD2_INC
KERNEL_1x2
STORE_1x2
LEAQ (Y_PTR)(INC_Y*2), Y_PTR
inc_r1c1:
TESTQ $1, N_DIM
JZ inc_end
// 1x1 KERNEL
KERNEL_1x1
STORE_1x1
inc_end:
RET

39
vendor/gonum.org/v1/gonum/internal/asm/f32/ge_noasm.go generated vendored Normal file
View File

@@ -0,0 +1,39 @@
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !amd64 || noasm || gccgo || safe
// +build !amd64 noasm gccgo safe
package f32
// Ger applies the rank-one update
//
//	A += alpha * x * yᵀ
//
// to the m×n dense matrix A, whose rows are lda elements apart in a. x is
// read with stride incX and y with stride incY. When an increment is
// negative (as a signed value), traversal of that vector starts at the far
// end.
func Ger(m, n uintptr, alpha float32, x []float32, incX uintptr, y []float32, incY uintptr, a []float32, lda uintptr) {
	if incX == 1 && incY == 1 {
		// Contiguous vectors: one unit-stride axpy per row of A.
		xs, ys := x[:m], y[:n]
		var rowStart uintptr
		for _, xv := range xs {
			AxpyUnitary(alpha*xv, ys, a[rowStart:rowStart+n])
			rowStart += lda
		}
		return
	}

	// Strided vectors: work out where each traversal begins.
	var startY, startX uintptr
	if int(incY) < 0 {
		startY = uintptr(-int(n-1) * int(incY))
	}
	if int(incX) < 0 {
		startX = uintptr(-int(m-1) * int(incX))
	}

	rows := int(m)
	for row, xi, rowStart := 0, startX, uintptr(0); row < rows; row, xi, rowStart = row+1, xi+incX, rowStart+lda {
		AxpyInc(alpha*x[xi], y, a[rowStart:rowStart+n], n, incY, 1, startY, 0)
	}
}

92
vendor/gonum.org/v1/gonum/internal/asm/f32/gemv.go generated vendored Normal file
View File

@@ -0,0 +1,92 @@
// Copyright ©2017 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package f32
// GemvN computes
//
//	y = alpha * A * x + beta * y
//
// for the m×n dense matrix A stored with row stride lda, vectors x (stride
// incX) and y (stride incY), and scalars alpha and beta. When beta is zero
// the previous contents of y are not read.
func GemvN(m, n uintptr, alpha float32, a []float32, lda uintptr, x []float32, incX uintptr, beta float32, y []float32, incY uintptr) {
	// Starting indices for vectors traversed with a negative increment.
	var xStart, yStart uintptr
	if int(incX) < 0 {
		xStart = uintptr(-int(n-1) * int(incX))
	}
	if int(incY) < 0 {
		yStart = uintptr(-int(m-1) * int(incY))
	}

	contiguous := incX == 1 && incY == 1
	switch {
	case contiguous && beta == 0:
		for row := uintptr(0); row < m; row++ {
			y[row] = alpha * DotUnitary(a[lda*row:lda*row+n], x)
		}
	case contiguous:
		for row := uintptr(0); row < m; row++ {
			y[row] = y[row]*beta + alpha*DotUnitary(a[lda*row:lda*row+n], x)
		}
	case beta == 0:
		for row, iy := uintptr(0), yStart; row < m; row, iy = row+1, iy+incY {
			y[iy] = alpha * DotInc(x, a[lda*row:lda*row+n], n, incX, 1, xStart, 0)
		}
	default:
		for row, iy := uintptr(0), yStart; row < m; row, iy = row+1, iy+incY {
			y[iy] = y[iy]*beta + alpha*DotInc(x, a[lda*row:lda*row+n], n, incX, 1, xStart, 0)
		}
	}
}
// GemvT computes
//
//	y = alpha * Aᵀ * x + beta * y
//
// where A is an m×n dense matrix, x and y are vectors, and alpha and beta are scalars.
//
// A is read row by row with a stride of lda elements. x supplies m elements
// with stride incX and y holds the n result elements with stride incY. Only
// those n elements of y are modified, even when y is longer.
func GemvT(m, n uintptr, alpha float32, a []float32, lda uintptr, x []float32, incX uintptr, beta float32, y []float32, incY uintptr) {
	// Starting indices for vectors traversed with a negative increment.
	var kx, ky, i uintptr
	if int(incX) < 0 {
		kx = uintptr(-int(m-1) * int(incX))
	}
	if int(incY) < 0 {
		ky = uintptr(-int(n-1) * int(incY))
	}

	// Step 1: y = beta * y.
	switch {
	case beta == 0: // beta == 0 is special-cased to memclear
		if incY == 1 {
			// Clear only the n result elements. Ranging over all of y
			// would also wipe caller data stored past the vector.
			for j := range y[:n] {
				y[j] = 0
			}
		} else {
			iy := ky
			for j := 0; j < int(n); j++ {
				y[iy] = 0
				iy += incY
			}
		}
	case int(incY) < 0:
		// Scaling is order-independent, so walk forward with |incY|.
		ScalInc(beta, y, n, uintptr(int(-incY)))
	case incY == 1:
		ScalUnitary(beta, y[:n])
	default:
		ScalInc(beta, y, n, incY)
	}

	// Step 2: y += alpha * x[i] * A[i, :] for every row i of A.
	if incX == 1 && incY == 1 {
		for i = 0; i < m; i++ {
			AxpyUnitaryTo(y, alpha*x[i], a[lda*i:lda*i+n], y)
		}
		return
	}
	ix := kx
	for i = 0; i < m; i++ {
		AxpyInc(alpha*x[ix], a[lda*i:lda*i+n], y, n, 1, incY, 0, ky)
		ix += incX
	}
}

90
vendor/gonum.org/v1/gonum/internal/asm/f32/l2norm.go generated vendored Normal file
View File

@@ -0,0 +1,90 @@
// Copyright ©2019 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package f32
import "gonum.org/v1/gonum/internal/math32"
// L2NormUnitary returns the Euclidean (level 2) norm of x.
//
// The sum of squares is kept relative to the largest magnitude seen so far.
// A NaN element yields NaN, and an infinite largest magnitude yields +Inf.
func L2NormUnitary(x []float32) (sum float32) {
	var (
		largest float32
		ssq     float32 = 1
	)
	for i := range x {
		if x[i] == 0 {
			continue
		}
		mag := math32.Abs(x[i])
		switch {
		case math32.IsNaN(mag):
			return math32.NaN()
		case largest < mag:
			// New largest magnitude: rescale what has been accumulated.
			ratio := largest / mag
			ssq = 1 + ssq*ratio*ratio
			largest = mag
		default:
			ratio := mag / largest
			ssq += ratio * ratio
		}
	}
	if math32.IsInf(largest, 1) {
		return math32.Inf(1)
	}
	return largest * math32.Sqrt(ssq)
}
// L2NormInc returns the Euclidean (level 2) norm of the n elements of x
// found at indices 0, incX, 2*incX, ...
//
// The sum of squares is kept relative to the largest magnitude seen so far.
// A NaN element yields NaN, and an infinite largest magnitude yields +Inf.
func L2NormInc(x []float32, n, incX uintptr) (sum float32) {
	var (
		largest float32
		ssq     float32 = 1
	)
	end := n * incX
	for ix := uintptr(0); ix < end; ix += incX {
		elem := x[ix]
		if elem == 0 {
			continue
		}
		mag := math32.Abs(elem)
		switch {
		case math32.IsNaN(mag):
			return math32.NaN()
		case largest < mag:
			// New largest magnitude: rescale what has been accumulated.
			ratio := largest / mag
			ssq = 1 + ssq*ratio*ratio
			largest = mag
		default:
			ratio := mag / largest
			ssq += ratio * ratio
		}
	}
	if math32.IsInf(largest, 1) {
		return math32.Inf(1)
	}
	return largest * math32.Sqrt(ssq)
}
// L2DistanceUnitary returns the Euclidean (level 2) norm of x-y, taken
// element by element over the length of x.
//
// The sum of squares is kept relative to the largest magnitude seen so far.
// A NaN difference yields NaN, and an infinite largest magnitude yields +Inf.
func L2DistanceUnitary(x, y []float32) (sum float32) {
	var (
		largest float32
		ssq     float32 = 1
	)
	for i, xv := range x {
		diff := xv - y[i]
		if diff == 0 {
			continue
		}
		mag := math32.Abs(diff)
		switch {
		case math32.IsNaN(mag):
			return math32.NaN()
		case largest < mag:
			// New largest magnitude: rescale what has been accumulated.
			ratio := largest / mag
			ssq = 1 + ssq*ratio*ratio
			largest = mag
		default:
			ratio := mag / largest
			ssq += ratio * ratio
		}
	}
	if math32.IsInf(largest, 1) {
		return math32.Inf(1)
	}
	return largest * math32.Sqrt(ssq)
}

59
vendor/gonum.org/v1/gonum/internal/asm/f32/scal.go generated vendored Normal file
View File

@@ -0,0 +1,59 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package f32
// ScalUnitary multiplies every element of x by alpha in place.
func ScalUnitary(alpha float32, x []float32) {
	for i := 0; i < len(x); i++ {
		x[i] *= alpha
	}
}
// ScalUnitaryTo writes alpha*x[i] into dst[i] for every index i of x.
// x itself is left unchanged unless it shares storage with dst.
func ScalUnitaryTo(dst []float32, alpha float32, x []float32) {
	for i := range x {
		dst[i] = alpha * x[i]
	}
}
// ScalInc multiplies n elements of x by alpha in place, visiting indices
// 0, incX, 2*incX, ...
func ScalInc(alpha float32, x []float32, n, incX uintptr) {
	for done, ix := 0, uintptr(0); done < int(n); done, ix = done+1, ix+incX {
		x[ix] *= alpha
	}
}
// ScalIncTo stores alpha times n elements of x into dst. x is read at
// indices 0, incX, 2*incX, ... and dst is written at indices
// 0, incDst, 2*incDst, ...
func ScalIncTo(dst []float32, incDst uintptr, alpha float32, x []float32, n, incX uintptr) {
	var src, out uintptr
	for left := int(n); left > 0; left-- {
		dst[out] = alpha * x[src]
		src += incX
		out += incDst
	}
}

View File

@@ -0,0 +1,86 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !noasm && !gccgo && !safe
// +build !noasm,!gccgo,!safe
package f32
// AxpyUnitary is
//
// for i, v := range x {
// y[i] += alpha * v
// }
//
// y is updated in place and is indexed over the full length of x.
// Declaration only: the body is supplied outside Go.
func AxpyUnitary(alpha float32, x, y []float32)
// AxpyUnitaryTo is
//
// for i, v := range x {
// dst[i] = alpha*v + y[i]
// }
//
// dst and y are both indexed over the full length of x.
// Declaration only: the body is supplied outside Go.
func AxpyUnitaryTo(dst []float32, alpha float32, x, y []float32)
// AxpyInc is
//
// for i := 0; i < int(n); i++ {
// y[iy] += alpha * x[ix]
// ix += incX
// iy += incY
// }
//
// ix and iy are the starting indices into x and y; incX and incY are the
// per-step index increments.
// Declaration only: the body is supplied outside Go.
func AxpyInc(alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr)
// AxpyIncTo is
//
// for i := 0; i < int(n); i++ {
// dst[idst] = alpha*x[ix] + y[iy]
// ix += incX
// iy += incY
// idst += incDst
// }
//
// idst, ix and iy are the starting indices into dst, x and y; incDst, incX
// and incY are the per-step index increments.
// Declaration only: the body is supplied outside Go.
func AxpyIncTo(dst []float32, incDst, idst uintptr, alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr)
// DdotUnitary is
//
// for i, v := range x {
// sum += float64(y[i]) * float64(v)
// }
// return
//
// The float32 inputs are widened to float64 before multiplying and the
// result is accumulated and returned as float64.
// Declaration only: the body is supplied outside Go.
func DdotUnitary(x, y []float32) (sum float64)
// DdotInc is
//
// for i := 0; i < int(n); i++ {
// sum += float64(y[iy]) * float64(x[ix])
// ix += incX
// iy += incY
// }
// return
//
// ix and iy are the starting indices into x and y. The float32 inputs are
// widened to float64 before multiplying and the result is returned as float64.
// Declaration only: the body is supplied outside Go.
func DdotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float64)
// DotUnitary is
//
// for i, v := range x {
// sum += y[i] * v
// }
// return sum
//
// y is indexed over the full length of x.
// Declaration only: the body is supplied outside Go.
// NOTE(review): a vectorised implementation may add terms in a different
// order from the loop above, which can change float32 rounding — confirm
// if bit-exact results matter.
func DotUnitary(x, y []float32) (sum float32)
// DotInc is
//
// for i := 0; i < int(n); i++ {
// sum += y[iy] * x[ix]
// ix += incX
// iy += incY
// }
// return sum
//
// ix and iy are the starting indices into x and y; incX and incY are the
// per-step index increments.
// Declaration only: the body is supplied outside Go.
func DotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float32)
// Sum is
//
// var sum float32
// for _, v := range x {
// sum += v
// }
// return sum
//
// Declaration only: the body is supplied outside Go.
// NOTE(review): a vectorised implementation may add terms in a different
// order from the loop above, which can change float32 rounding — confirm
// if bit-exact results matter.
func Sum(x []float32) float32

View File

@@ -0,0 +1,137 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !amd64 || noasm || gccgo || safe
// +build !amd64 noasm gccgo safe
package f32
// AxpyUnitary adds alpha*x[i] to y[i] for every index i of x, updating y in
// place.
func AxpyUnitary(alpha float32, x, y []float32) {
	for i := range x {
		y[i] += alpha * x[i]
	}
}
// AxpyUnitaryTo stores alpha*x[i] + y[i] into dst[i] for every index i of x.
func AxpyUnitaryTo(dst []float32, alpha float32, x, y []float32) {
	for i := range x {
		dst[i] = alpha*x[i] + y[i]
	}
}
// AxpyInc performs n steps of y[iy] += alpha * x[ix], starting at the given
// ix and iy and advancing them by incX and incY after every step.
func AxpyInc(alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr) {
	for left := int(n); left > 0; left-- {
		y[iy] += alpha * x[ix]
		ix, iy = ix+incX, iy+incY
	}
}
// AxpyIncTo performs n steps of dst[idst] = alpha*x[ix] + y[iy], starting at
// the given idst, ix and iy and advancing them by incDst, incX and incY
// after every step.
func AxpyIncTo(dst []float32, incDst, idst uintptr, alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr) {
	for left := int(n); left > 0; left-- {
		dst[idst] = alpha*x[ix] + y[iy]
		ix, iy, idst = ix+incX, iy+incY, idst+incDst
	}
}
// DotUnitary returns the sum of y[i]*x[i] over every index i of x,
// accumulated in float32 in index order.
func DotUnitary(x, y []float32) (sum float32) {
	for i := range x {
		sum += y[i] * x[i]
	}
	return sum
}
// DotInc returns the sum of n products y[iy]*x[ix], starting at the given ix
// and iy and advancing them by incX and incY after every product.
func DotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float32) {
	for left := int(n); left > 0; left-- {
		sum += y[iy] * x[ix]
		ix, iy = ix+incX, iy+incY
	}
	return sum
}
// DdotUnitary returns the dot product of x and y over the length of x. Each
// float32 operand is widened to float64 before multiplying and the sum is
// accumulated in float64.
func DdotUnitary(x, y []float32) (sum float64) {
	for i := range x {
		sum += float64(y[i]) * float64(x[i])
	}
	return
}
// DdotInc returns the sum of n products of y[iy] and x[ix], each widened to
// float64 before multiplying. ix and iy start at the given values and
// advance by incX and incY after every product.
func DdotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float64) {
	for left := int(n); left > 0; left-- {
		sum += float64(y[iy]) * float64(x[ix])
		ix, iy = ix+incX, iy+incY
	}
	return
}
// Sum returns the float32 sum of the elements of x, added in index order.
// An empty slice sums to zero.
func Sum(x []float32) float32 {
	var total float32
	for i := 0; i < len(x); i++ {
		total += x[i]
	}
	return total
}

100
vendor/gonum.org/v1/gonum/internal/asm/f32/sum_amd64.s generated vendored Normal file
View File

@@ -0,0 +1,100 @@
// Copyright ©2021 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
// Register roles for Sum.
#define X_PTR SI
#define IDX AX
#define LEN CX
#define TAIL BX
// Four packed accumulators; SUM also carries the final scalar result.
#define SUM X0
#define SUM_1 X1
#define SUM_2 X2
#define SUM_3 X3
// func Sum(x []float32) float32
//
// Adds up the elements of x. Leading elements are added one at a time until
// the data address is 16-byte aligned. The bulk is then added 16 elements
// per iteration into four packed accumulators, and the remainder is handled
// by 8-, 4-, 2- and 1-element steps selected by the bits of the remaining
// length. The packed loads use ADDPS with a memory operand, which is why the
// alignment prologue is needed.
// NOTE(review): the alignment loop steps 4 bytes at a time, so it presumably
// relies on &x[0] being 4-byte aligned — confirm.
TEXT ·Sum(SB), NOSPLIT, $0
MOVQ x_base+0(FP), X_PTR // X_PTR = &x
MOVQ x_len+8(FP), LEN // LEN = len(x)
XORQ IDX, IDX // i = 0
PXOR SUM, SUM // p_sum_i = 0
CMPQ LEN, $0 // if LEN == 0 { return 0 }
JE sum_end
PXOR SUM_1, SUM_1
PXOR SUM_2, SUM_2
PXOR SUM_3, SUM_3
MOVQ X_PTR, TAIL // Check memory alignment
ANDQ $15, TAIL // TAIL = &x % 16
JZ no_trim // if TAIL == 0 { goto no_trim }
SUBQ $16, TAIL // TAIL -= 16 ( negative count of bytes to the boundary )
sum_align: // Align on 16-byte boundary do {
ADDSS (X_PTR)(IDX*4), SUM // SUM += x[i]
INCQ IDX // i++
DECQ LEN // LEN--
JZ sum_end // if LEN == 0 { return }
ADDQ $4, TAIL // TAIL += 4
JNZ sum_align // } while TAIL < 0
no_trim:
MOVQ LEN, TAIL // TAIL = remaining length; its low bits pick the tail steps
SHRQ $4, LEN // LEN = floor( n / 16 )
JZ sum_tail8 // if LEN == 0 { goto sum_tail8 }
sum_loop: // sum 16x wide do {
ADDPS (X_PTR)(IDX*4), SUM // sum_i += x[i:i+4]
ADDPS 16(X_PTR)(IDX*4), SUM_1
ADDPS 32(X_PTR)(IDX*4), SUM_2
ADDPS 48(X_PTR)(IDX*4), SUM_3
ADDQ $16, IDX // i += 16
DECQ LEN
JNZ sum_loop // } while --LEN > 0
sum_tail8:
// Fold the four accumulators down to two before the 8-wide step.
ADDPS SUM_3, SUM
ADDPS SUM_2, SUM_1
TESTQ $8, TAIL
JZ sum_tail4
ADDPS (X_PTR)(IDX*4), SUM // sum_i += x[i:i+4]
ADDPS 16(X_PTR)(IDX*4), SUM_1
ADDQ $8, IDX
sum_tail4:
// Fold the two accumulators down to one before the 4-wide step.
ADDPS SUM_1, SUM
TESTQ $4, TAIL
JZ sum_tail2
ADDPS (X_PTR)(IDX*4), SUM // sum_i += x[i:i+4]
ADDQ $4, IDX
sum_tail2:
HADDPS SUM, SUM // sum_i[:2] += sum_i[2:4]
TESTQ $2, TAIL
JZ sum_tail1
MOVSD (X_PTR)(IDX*4), SUM_1 // reuse SUM_1
ADDPS SUM_1, SUM // sum_i += x[i:i+2]
ADDQ $2, IDX
sum_tail1:
HADDPS SUM, SUM // sum_i[0] += sum_i[1]
TESTQ $1, TAIL
JZ sum_end
ADDSS (X_PTR)(IDX*4), SUM
sum_end: // return sum
MOVSS SUM, ret+24(FP)
RET

View File

@@ -0,0 +1,82 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
// func L1Norm(x []float64) float64
//
// Sums the absolute values of x without an explicit abs: for each element
// the running sum p becomes max( p + x[i], p - x[i] ). Eight elements are
// processed per main-loop iteration in four packed accumulator pairs
// (X0/X1, X2/X3, X4/X5, X6/X7); leftovers are handled one at a time.
// NOTE(review): the result is written through the name sum+24(FP) although
// the signature comment above shows an unnamed result — confirm against the
// Go declaration.
TEXT ·L1Norm(SB), NOSPLIT, $0
MOVQ x_base+0(FP), SI // SI = &x
MOVQ x_len+8(FP), CX // CX = len(x)
XORQ AX, AX // i = 0
PXOR X0, X0 // p_sum_i = 0
PXOR X1, X1
PXOR X2, X2
PXOR X3, X3
PXOR X4, X4
PXOR X5, X5
PXOR X6, X6
PXOR X7, X7
CMPQ CX, $0 // if CX == 0 { return 0 }
JE absum_end
MOVQ CX, BX
ANDQ $7, BX // BX = len(x) % 8
SHRQ $3, CX // CX = floor( len(x) / 8 )
JZ absum_tail_start // if CX == 0 { goto absum_tail_start }
absum_loop: // do {
// p_sum = max( p_sum + x[i], p_sum - x[i] )
MOVUPS (SI)(AX*8), X8 // X_i = x[i:i+2]
MOVUPS 16(SI)(AX*8), X9
MOVUPS 32(SI)(AX*8), X10
MOVUPS 48(SI)(AX*8), X11
ADDPD X8, X0 // p_sum_i += X_i ( positive values )
ADDPD X9, X2
ADDPD X10, X4
ADDPD X11, X6
SUBPD X8, X1 // p_sum_(i+1) -= X_i ( negative values )
SUBPD X9, X3
SUBPD X10, X5
SUBPD X11, X7
MAXPD X1, X0 // p_sum_i = max( p_sum_i, p_sum_(i+1) )
MAXPD X3, X2
MAXPD X5, X4
MAXPD X7, X6
MOVAPS X0, X1 // p_sum_(i+1) = p_sum_i
MOVAPS X2, X3
MOVAPS X4, X5
MOVAPS X6, X7
ADDQ $8, AX // i += 8
LOOP absum_loop // } while --CX > 0
// p_sum_0 = \sum_{i=1}^{3}( p_sum_(i*2) )
// The odd registers are copies of their even partners at this point, so
// adding X3, X5 and X7 adds the X2, X4 and X6 partial sums.
ADDPD X3, X0
ADDPD X5, X7
ADDPD X7, X0
// p_sum_0[0] = p_sum_0[0] + p_sum_0[1]
MOVAPS X0, X1
SHUFPD $0x3, X0, X0 // lower( p_sum_0 ) = upper( p_sum_0 )
ADDSD X1, X0
CMPQ BX, $0
JE absum_end // if BX == 0 { goto absum_end }
absum_tail_start: // Reset loop registers
MOVQ BX, CX // Loop counter: CX = BX
XORPS X8, X8 // X_8 = 0
absum_tail: // do {
// p_sum = max( p_sum + x[i], p_sum - x[i] )
MOVSD (SI)(AX*8), X8 // X_8 = x[i]
MOVSD X0, X1 // p_sum_1 = p_sum_0
ADDSD X8, X0 // p_sum_0 += X_8
SUBSD X8, X1 // p_sum_1 -= X_8
MAXSD X1, X0 // p_sum_0 = max( p_sum_0, p_sum_1 )
INCQ AX // i++
LOOP absum_tail // } while --CX > 0
absum_end: // return p_sum_0
MOVSD X0, sum+24(FP)
RET

View File

@@ -0,0 +1,90 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
// func L1NormInc(x []float64, n, incX int) (sum float64)
//
// Strided form of the absolute-value sum: n elements are read incX elements
// apart, starting at x[0]. For each element the running sum p becomes
// max( p + x[i], p - x[i] ). Eight elements are gathered per main-loop
// iteration into four packed accumulator pairs; leftovers are handled one at
// a time.
TEXT ·L1NormInc(SB), NOSPLIT, $0
MOVQ x_base+0(FP), SI // SI = &x
MOVQ n+24(FP), CX // CX = n
MOVQ incX+32(FP), AX // AX = increment * sizeof( float64 )
SHLQ $3, AX
MOVQ AX, DX // DX = AX * 3
IMULQ $3, DX
PXOR X0, X0 // p_sum_i = 0
PXOR X1, X1
PXOR X2, X2
PXOR X3, X3
PXOR X4, X4
PXOR X5, X5
PXOR X6, X6
PXOR X7, X7
CMPQ CX, $0 // if CX == 0 { return 0 }
JE absum_end
MOVQ CX, BX
ANDQ $7, BX // BX = n % 8
SHRQ $3, CX // CX = floor( n / 8 )
JZ absum_tail_start // if CX == 0 { goto absum_tail_start }
absum_loop: // do {
// p_sum = max( p_sum + x[i], p_sum - x[i] )
MOVSD (SI), X8 // X_i[0] = x[i]
MOVSD (SI)(AX*1), X9
MOVSD (SI)(AX*2), X10
MOVSD (SI)(DX*1), X11
LEAQ (SI)(AX*4), SI // SI = &SI[4*incX]
MOVHPD (SI), X8 // X_i[1] = x[i+4]
MOVHPD (SI)(AX*1), X9
MOVHPD (SI)(AX*2), X10
MOVHPD (SI)(DX*1), X11
ADDPD X8, X0 // p_sum_i += X_i ( positive values )
ADDPD X9, X2
ADDPD X10, X4
ADDPD X11, X6
SUBPD X8, X1 // p_sum_(i+1) -= X_i ( negative values )
SUBPD X9, X3
SUBPD X10, X5
SUBPD X11, X7
MAXPD X1, X0 // p_sum_i = max( p_sum_i, p_sum_(i+1) )
MAXPD X3, X2
MAXPD X5, X4
MAXPD X7, X6
MOVAPS X0, X1 // p_sum_(i+1) = p_sum_i
MOVAPS X2, X3
MOVAPS X4, X5
MOVAPS X6, X7
LEAQ (SI)(AX*4), SI // SI = &SI[4*incX]
LOOP absum_loop // } while --CX > 0
// p_sum_0 = \sum_{i=1}^{3}( p_sum_(i*2) )
// The odd registers are copies of their even partners at this point, so
// adding X3, X5 and X7 adds the X2, X4 and X6 partial sums.
ADDPD X3, X0
ADDPD X5, X7
ADDPD X7, X0
// p_sum_0[0] = p_sum_0[0] + p_sum_0[1]
MOVAPS X0, X1
SHUFPD $0x3, X0, X0 // lower( p_sum_0 ) = upper( p_sum_0 )
ADDSD X1, X0
CMPQ BX, $0
JE absum_end // if BX == 0 { goto absum_end }
absum_tail_start: // Reset loop registers
MOVQ BX, CX // Loop counter: CX = BX
XORPS X8, X8 // X_8 = 0
absum_tail: // do {
// p_sum = max( p_sum + x[i], p_sum - x[i] )
MOVSD (SI), X8 // X_8 = x[i]
MOVSD X0, X1 // p_sum_1 = p_sum_0
ADDSD X8, X0 // p_sum_0 += X_8
SUBSD X8, X1 // p_sum_1 -= X_8
MAXSD X1, X0 // p_sum_0 = max( p_sum_0, p_sum_1 )
ADDQ AX, SI // SI += incX * sizeof( float64 )
LOOP absum_tail // } while --CX > 0
absum_end: // return p_sum_0
MOVSD X0, sum+40(FP)
RET

66
vendor/gonum.org/v1/gonum/internal/asm/f64/add_amd64.s generated vendored Normal file
View File

@@ -0,0 +1,66 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
// func Add(dst, s []float64)
//
// Adds s to dst element-wise in place over the shorter of the two lengths.
// If dst does not start on a 16-byte boundary, one element is handled on its
// own first. The bulk is then processed eight elements per iteration and any
// leftovers one at a time. The packed adds read dst through ADDPD memory
// operands, which is why dst is the slice being aligned.
// NOTE(review): a single 8-byte step only reaches a 16-byte boundary if
// &dst[0] is 8-byte aligned — presumably guaranteed for []float64; confirm.
TEXT ·Add(SB), NOSPLIT, $0
MOVQ dst_base+0(FP), DI // DI = &dst
MOVQ dst_len+8(FP), CX // CX = len(dst)
MOVQ s_base+24(FP), SI // SI = &s
CMPQ s_len+32(FP), CX // CX = min( CX, len(s) )
CMOVQLE s_len+32(FP), CX
CMPQ CX, $0 // if CX == 0 { return }
JE add_end
XORQ AX, AX
MOVQ DI, BX
ANDQ $0x0F, BX // BX = &dst & 15
JZ add_no_trim // if BX == 0 { goto add_no_trim }
// Align on 16-byte boundary
MOVSD (SI)(AX*8), X0 // X0 = s[i]
ADDSD (DI)(AX*8), X0 // X0 += dst[i]
MOVSD X0, (DI)(AX*8) // dst[i] = X0
INCQ AX // i++
DECQ CX // --CX
JE add_end // if CX == 0 { return }
add_no_trim:
MOVQ CX, BX
ANDQ $7, BX // BX = remaining % 8
SHRQ $3, CX // CX = floor( remaining / 8 )
JZ add_tail_start // if CX == 0 { goto add_tail_start }
add_loop: // Loop unrolled 8x do {
MOVUPS (SI)(AX*8), X0 // X_i = s[i:i+2]
MOVUPS 16(SI)(AX*8), X1
MOVUPS 32(SI)(AX*8), X2
MOVUPS 48(SI)(AX*8), X3
ADDPD (DI)(AX*8), X0 // X_i += dst[i:i+2]
ADDPD 16(DI)(AX*8), X1
ADDPD 32(DI)(AX*8), X2
ADDPD 48(DI)(AX*8), X3
MOVUPS X0, (DI)(AX*8) // dst[i:i+2] = X_i
MOVUPS X1, 16(DI)(AX*8)
MOVUPS X2, 32(DI)(AX*8)
MOVUPS X3, 48(DI)(AX*8)
ADDQ $8, AX // i += 8
LOOP add_loop // } while --CX > 0
CMPQ BX, $0 // if BX == 0 { return }
JE add_end
add_tail_start: // Reset loop registers
MOVQ BX, CX // Loop counter: CX = BX
add_tail: // do {
MOVSD (SI)(AX*8), X0 // X0 = s[i]
ADDSD (DI)(AX*8), X0 // X0 += dst[i]
MOVSD X0, (DI)(AX*8) // dst[i] = X0
INCQ AX // ++i
LOOP add_tail // } while --CX > 0
add_end:
RET

View File

@@ -0,0 +1,53 @@
// Copyright ©2016 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
// func AddConst(alpha float64, x []float64)
//
// Adds alpha to every element of x in place. alpha is broadcast to both
// lanes of X4 (with a copy in X5). The bulk is processed eight elements per
// iteration with unaligned loads and stores, and any leftovers one at a time.
TEXT ·AddConst(SB), NOSPLIT, $0
MOVQ x_base+8(FP), SI // SI = &x
MOVQ x_len+16(FP), CX // CX = len(x)
CMPQ CX, $0 // if len(x) == 0 { return }
JE ac_end
MOVSD alpha+0(FP), X4 // X4 = { a, a }
SHUFPD $0, X4, X4
MOVUPS X4, X5 // X5 = X4
XORQ AX, AX // i = 0
MOVQ CX, BX
ANDQ $7, BX // BX = len(x) % 8
SHRQ $3, CX // CX = floor( len(x) / 8 )
JZ ac_tail_start // if CX == 0 { goto ac_tail_start }
ac_loop: // Loop unrolled 8x do {
MOVUPS (SI)(AX*8), X0 // X_i = x[i:i+2]
MOVUPS 16(SI)(AX*8), X1
MOVUPS 32(SI)(AX*8), X2
MOVUPS 48(SI)(AX*8), X3
ADDPD X4, X0 // X_i += a
ADDPD X5, X1
ADDPD X4, X2
ADDPD X5, X3
MOVUPS X0, (SI)(AX*8) // x[i:i+2] = X_i
MOVUPS X1, 16(SI)(AX*8)
MOVUPS X2, 32(SI)(AX*8)
MOVUPS X3, 48(SI)(AX*8)
ADDQ $8, AX // i += 8
LOOP ac_loop // } while --CX > 0
CMPQ BX, $0 // if BX == 0 { return }
JE ac_end
ac_tail_start: // Reset loop counters
MOVQ BX, CX // Loop counter: CX = BX
ac_tail: // do {
MOVSD (SI)(AX*8), X0 // X0 = x[i]
ADDSD X4, X0 // X0 += a
MOVSD X0, (SI)(AX*8) // x[i] = X0
INCQ AX // ++i
LOOP ac_tail // } while --CX > 0
ac_end:
RET

62
vendor/gonum.org/v1/gonum/internal/asm/f64/axpy.go generated vendored Normal file
View File

@@ -0,0 +1,62 @@
// Copyright ©2015 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:build !amd64 || noasm || gccgo || safe
// +build !amd64 noasm gccgo safe
package f64
// AxpyUnitary adds alpha*x[i] to y[i] for every index i of x, updating y in
// place.
func AxpyUnitary(alpha float64, x, y []float64) {
	for i := range x {
		y[i] += alpha * x[i]
	}
}
// AxpyUnitaryTo stores alpha*x[i] + y[i] into dst[i] for every index i of x.
func AxpyUnitaryTo(dst []float64, alpha float64, x, y []float64) {
	for i := range x {
		dst[i] = alpha*x[i] + y[i]
	}
}
// AxpyInc performs n steps of y[iy] += alpha * x[ix], starting at the given
// ix and iy and advancing them by incX and incY after every step.
func AxpyInc(alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) {
	for left := int(n); left > 0; left-- {
		y[iy] += alpha * x[ix]
		ix, iy = ix+incX, iy+incY
	}
}
// AxpyIncTo performs n steps of dst[idst] = alpha*x[ix] + y[iy], starting at
// the given idst, ix and iy and advancing them by incDst, incX and incY
// after every step.
func AxpyIncTo(dst []float64, incDst, idst uintptr, alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) {
	for left := int(n); left > 0; left-- {
		dst[idst] = alpha*x[ix] + y[iy]
		ix, iy, idst = ix+incX, iy+incY, idst+incDst
	}
}

View File

@@ -0,0 +1,142 @@
// Copyright ©2015 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//
// Some of the loop unrolling code is copied from:
// http://golang.org/src/math/big/arith_amd64.s
// which is distributed under these terms:
//
// Copyright (c) 2012 The Go Authors. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
#define X_PTR SI
#define Y_PTR DI
#define DST_PTR DI
#define IDX AX
#define LEN CX
#define TAIL BX
#define INC_X R8
#define INCx3_X R11
#define INC_Y R9
#define INCx3_Y R12
#define INC_DST R9
#define INCx3_DST R12
#define ALPHA X0
#define ALPHA_2 X1
// func AxpyInc(alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr)
//
// AxpyInc computes y[iy + i*incY] += alpha * x[ix + i*incX] for i in [0, n).
// The main loop processes four elements per iteration; the remaining n % 4
// elements are processed as one optional pair (tail_two) followed by one
// optional single element (tail_one).
//
// In this file Y_PTR and DST_PTR are both defined as DI, and INC_Y/INC_DST and
// INCx3_Y/INCx3_DST share R9 and R12, so every store goes back to the location
// the matching y value was loaded from.
// Registers written: SI, DI, CX, BX, R8, R9, R11, R12, X0-X5.
TEXT ·AxpyInc(SB), NOSPLIT, $0
MOVQ x_base+8(FP), X_PTR // X_PTR = &x
MOVQ y_base+32(FP), Y_PTR // Y_PTR = &y
MOVQ n+56(FP), LEN // LEN = n
CMPQ LEN, $0 // if LEN == 0 { return }
JE end
MOVQ ix+80(FP), INC_X // INC_X = ix (used as scratch for the start index)
MOVQ iy+88(FP), INC_Y // INC_Y = iy (used as scratch for the start index)
LEAQ (X_PTR)(INC_X*8), X_PTR // X_PTR = &(x[ix])
LEAQ (Y_PTR)(INC_Y*8), Y_PTR // Y_PTR = &(y[iy])
MOVQ Y_PTR, DST_PTR // DST_PTR = Y_PTR // Write pointer (same register as Y_PTR here)
MOVQ incX+64(FP), INC_X // INC_X = incX * sizeof(float64)
SHLQ $3, INC_X
MOVQ incY+72(FP), INC_Y // INC_Y = incY * sizeof(float64)
SHLQ $3, INC_Y
MOVSD alpha+0(FP), ALPHA // ALPHA = alpha
MOVQ LEN, TAIL
ANDQ $3, TAIL // TAIL = n % 4
SHRQ $2, LEN // LEN = floor( n / 4 )
JZ tail_start // if LEN == 0 { goto tail_start }
MOVAPS ALPHA, ALPHA_2 // ALPHA_2 = ALPHA for pipelining
LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3
LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = INC_Y * 3
loop: // do { // y[i] += alpha * x[i] unrolled 4x.
MOVSD (X_PTR), X2 // X_i = x[i]
MOVSD (X_PTR)(INC_X*1), X3
MOVSD (X_PTR)(INC_X*2), X4
MOVSD (X_PTR)(INCx3_X*1), X5
MULSD ALPHA, X2 // X_i *= a
MULSD ALPHA_2, X3
MULSD ALPHA, X4
MULSD ALPHA_2, X5
ADDSD (Y_PTR), X2 // X_i += y[i]
ADDSD (Y_PTR)(INC_Y*1), X3
ADDSD (Y_PTR)(INC_Y*2), X4
ADDSD (Y_PTR)(INCx3_Y*1), X5
MOVSD X2, (DST_PTR) // y[i] = X_i
MOVSD X3, (DST_PTR)(INC_DST*1)
MOVSD X4, (DST_PTR)(INC_DST*2)
MOVSD X5, (DST_PTR)(INCx3_DST*1)
LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4])
LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[incY*4])
DECQ LEN
JNZ loop // } while --LEN > 0
CMPQ TAIL, $0 // if TAIL == 0 { return }
JE end
tail_start: // Reset Loop registers
MOVQ TAIL, LEN // Loop counter: LEN = TAIL
SHRQ $1, LEN // LEN = floor( LEN / 2 )
JZ tail_one // if LEN == 0 { goto tail_one }
tail_two: // y[i] += alpha * x[i] for one pair of elements (TAIL is 2 or 3 here).
MOVSD (X_PTR), X2 // X_i = x[i]
MOVSD (X_PTR)(INC_X*1), X3
MULSD ALPHA, X2 // X_i *= a
MULSD ALPHA, X3
ADDSD (Y_PTR), X2 // X_i += y[i]
ADDSD (Y_PTR)(INC_Y*1), X3
MOVSD X2, (DST_PTR) // y[i] = X_i
MOVSD X3, (DST_PTR)(INC_DST*1)
LEAQ (X_PTR)(INC_X*2), X_PTR // X_PTR = &(X_PTR[incX*2])
LEAQ (Y_PTR)(INC_Y*2), Y_PTR // Y_PTR = &(Y_PTR[incY*2])
ANDQ $1, TAIL
JZ end // if TAIL == 0 { goto end }
tail_one:
// y[i] += alpha * x[i] for the single remaining element (reached only when n is odd).
MOVSD (X_PTR), X2 // X2 = x[i]
MULSD ALPHA, X2 // X2 *= a
ADDSD (Y_PTR), X2 // X2 += y[i]
MOVSD X2, (DST_PTR) // y[i] = X2
end:
RET

View File

@@ -0,0 +1,148 @@
// Copyright ©2015 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//
// Some of the loop unrolling code is copied from:
// http://golang.org/src/math/big/arith_amd64.s
// which is distributed under these terms:
//
// Copyright (c) 2012 The Go Authors. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
#define X_PTR SI
#define Y_PTR DI
#define DST_PTR DX
#define IDX AX
#define LEN CX
#define TAIL BX
#define INC_X R8
#define INCx3_X R11
#define INC_Y R9
#define INCx3_Y R12
#define INC_DST R10
#define INCx3_DST R13
#define ALPHA X0
#define ALPHA_2 X1
// func AxpyIncTo(dst []float64, incDst, idst uintptr, alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr)
//
// AxpyIncTo computes dst[idst + i*incDst] = alpha * x[ix + i*incX] + y[iy + i*incY]
// for i in [0, n). The main loop processes four elements per iteration; the
// remaining n % 4 elements are processed as one optional pair (tail_two)
// followed by one optional single element (tail_one).
// Registers written: SI, DI, DX, CX, BX, R8, R9, R10, R11, R12, R13, X0-X5.
TEXT ·AxpyIncTo(SB), NOSPLIT, $0
MOVQ dst_base+0(FP), DST_PTR // DST_PTR := &dst
MOVQ x_base+48(FP), X_PTR // X_PTR := &x
MOVQ y_base+72(FP), Y_PTR // Y_PTR := &y
MOVQ n+96(FP), LEN // LEN := n
CMPQ LEN, $0 // if LEN == 0 { return }
JE end
MOVQ ix+120(FP), INC_X // INC_X = ix (used as scratch for the start index)
LEAQ (X_PTR)(INC_X*8), X_PTR // X_PTR = &(x[ix])
MOVQ iy+128(FP), INC_Y // INC_Y = iy (used as scratch for the start index)
LEAQ (Y_PTR)(INC_Y*8), Y_PTR // Y_PTR = &(y[iy])
MOVQ idst+32(FP), INC_DST // INC_DST = idst (used as scratch for the start index)
LEAQ (DST_PTR)(INC_DST*8), DST_PTR // DST_PTR = &(dst[idst])
MOVQ incX+104(FP), INC_X // INC_X = incX * sizeof(float64)
SHLQ $3, INC_X
MOVQ incY+112(FP), INC_Y // INC_Y = incY * sizeof(float64)
SHLQ $3, INC_Y
MOVQ incDst+24(FP), INC_DST // INC_DST = incDst * sizeof(float64)
SHLQ $3, INC_DST
MOVSD alpha+40(FP), ALPHA // ALPHA = alpha
MOVQ LEN, TAIL
ANDQ $3, TAIL // TAIL = n % 4
SHRQ $2, LEN // LEN = floor( n / 4 )
JZ tail_start // if LEN == 0 { goto tail_start }
MOVSD ALPHA, ALPHA_2 // ALPHA_2 = ALPHA for pipelining
LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3
LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = INC_Y * 3
LEAQ (INC_DST)(INC_DST*2), INCx3_DST // INCx3_DST = INC_DST * 3
loop: // do { // dst[i] = alpha * x[i] + y[i] unrolled 4x.
MOVSD (X_PTR), X2 // X_i = x[i]
MOVSD (X_PTR)(INC_X*1), X3
MOVSD (X_PTR)(INC_X*2), X4
MOVSD (X_PTR)(INCx3_X*1), X5
MULSD ALPHA, X2 // X_i *= a
MULSD ALPHA_2, X3
MULSD ALPHA, X4
MULSD ALPHA_2, X5
ADDSD (Y_PTR), X2 // X_i += y[i]
ADDSD (Y_PTR)(INC_Y*1), X3
ADDSD (Y_PTR)(INC_Y*2), X4
ADDSD (Y_PTR)(INCx3_Y*1), X5
MOVSD X2, (DST_PTR) // dst[i] = X_i
MOVSD X3, (DST_PTR)(INC_DST*1)
MOVSD X4, (DST_PTR)(INC_DST*2)
MOVSD X5, (DST_PTR)(INCx3_DST*1)
LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4])
LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[incY*4])
LEAQ (DST_PTR)(INC_DST*4), DST_PTR // DST_PTR = &(DST_PTR[incDst*4])
DECQ LEN
JNZ loop // } while --LEN > 0
CMPQ TAIL, $0 // if TAIL == 0 { return }
JE end
tail_start: // Reset Loop registers
MOVQ TAIL, LEN // Loop counter: LEN = TAIL
SHRQ $1, LEN // LEN = floor( LEN / 2 )
JZ tail_one // if LEN == 0 { goto tail_one }
tail_two: // dst[i] = alpha * x[i] + y[i] for one pair of elements (TAIL is 2 or 3 here).
MOVSD (X_PTR), X2 // X_i = x[i]
MOVSD (X_PTR)(INC_X*1), X3
MULSD ALPHA, X2 // X_i *= a
MULSD ALPHA, X3
ADDSD (Y_PTR), X2 // X_i += y[i]
ADDSD (Y_PTR)(INC_Y*1), X3
MOVSD X2, (DST_PTR) // dst[i] = X_i
MOVSD X3, (DST_PTR)(INC_DST*1)
LEAQ (X_PTR)(INC_X*2), X_PTR // X_PTR = &(X_PTR[incX*2])
LEAQ (Y_PTR)(INC_Y*2), Y_PTR // Y_PTR = &(Y_PTR[incY*2])
LEAQ (DST_PTR)(INC_DST*2), DST_PTR // DST_PTR = &(DST_PTR[incDst*2])
ANDQ $1, TAIL
JZ end // if TAIL == 0 { goto end }
tail_one: // Single remaining element (reached only when n is odd).
MOVSD (X_PTR), X2 // X2 = x[i]
MULSD ALPHA, X2 // X2 *= a
ADDSD (Y_PTR), X2 // X2 += y[i]
MOVSD X2, (DST_PTR) // dst[i] = X2
end:
RET

View File

@@ -0,0 +1,134 @@
// Copyright ©2015 The Gonum Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//
// Some of the loop unrolling code is copied from:
// http://golang.org/src/math/big/arith_amd64.s
// which is distributed under these terms:
//
// Copyright (c) 2012 The Go Authors. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// +build !noasm,!gccgo,!safe
#include "textflag.h"
#define X_PTR SI
#define Y_PTR DI
#define DST_PTR DI
#define IDX AX
#define LEN CX
#define TAIL BX
#define ALPHA X0
#define ALPHA_2 X1
// func AxpyUnitary(alpha float64, x, y []float64)
//
// AxpyUnitary computes y[i] += alpha * x[i] for i in [0, min(len(x), len(y))).
// If &y[0] is not a multiple of 16, one element is processed up front before
// the packed loops start. The main loop then processes eight elements per
// iteration using packed (two-lane) SSE2 arithmetic, and the remaining n % 8
// elements are processed two at a time (tail_two) followed by one optional
// single element (tail_one).
//
// In this file Y_PTR and DST_PTR are both defined as DI, so every store goes
// back to the location the matching y value was loaded from.
// Registers written: SI, DI, AX, CX, BX, X0-X5.
TEXT ·AxpyUnitary(SB), NOSPLIT, $0
MOVQ x_base+8(FP), X_PTR // X_PTR := &x
MOVQ y_base+32(FP), Y_PTR // Y_PTR := &y
MOVQ x_len+16(FP), LEN // LEN = min( len(x), len(y) )
CMPQ y_len+40(FP), LEN
CMOVQLE y_len+40(FP), LEN
CMPQ LEN, $0 // if LEN == 0 { return }
JE end
XORQ IDX, IDX // i = 0
MOVSD alpha+0(FP), ALPHA // ALPHA := { alpha, alpha }
SHUFPD $0, ALPHA, ALPHA
MOVUPS ALPHA, ALPHA_2 // ALPHA_2 := ALPHA for pipelining
MOVQ Y_PTR, TAIL // Check memory alignment
ANDQ $15, TAIL // TAIL = &y % 16
JZ no_trim // if TAIL == 0 { goto no_trim }
// Align on 16-byte boundary: process y[0] on its own so the packed accesses
// below start at &y[1].
// NOTE(review): this reaches a 16-byte boundary only if &y is 8-byte aligned — confirm.
MOVSD (X_PTR), X2 // X2 := x[0]
MULSD ALPHA, X2 // X2 *= a
ADDSD (Y_PTR), X2 // X2 += y[0]
MOVSD X2, (DST_PTR) // y[0] = X2
INCQ IDX // i++
DECQ LEN // LEN--
JZ end // if LEN == 0 { return }
no_trim:
MOVQ LEN, TAIL
ANDQ $7, TAIL // TAIL := n % 8
SHRQ $3, LEN // LEN = floor( n / 8 )
JZ tail_start // if LEN == 0 { goto tail_start }
loop: // do {
// y[i] += alpha * x[i] unrolled 8x.
MOVUPS (X_PTR)(IDX*8), X2 // X_i = x[i]
MOVUPS 16(X_PTR)(IDX*8), X3
MOVUPS 32(X_PTR)(IDX*8), X4
MOVUPS 48(X_PTR)(IDX*8), X5
MULPD ALPHA, X2 // X_i *= a
MULPD ALPHA_2, X3
MULPD ALPHA, X4
MULPD ALPHA_2, X5
ADDPD (Y_PTR)(IDX*8), X2 // X_i += y[i]
ADDPD 16(Y_PTR)(IDX*8), X3
ADDPD 32(Y_PTR)(IDX*8), X4
ADDPD 48(Y_PTR)(IDX*8), X5
MOVUPS X2, (DST_PTR)(IDX*8) // y[i] = X_i
MOVUPS X3, 16(DST_PTR)(IDX*8)
MOVUPS X4, 32(DST_PTR)(IDX*8)
MOVUPS X5, 48(DST_PTR)(IDX*8)
ADDQ $8, IDX // i += 8
DECQ LEN
JNZ loop // } while --LEN > 0
CMPQ TAIL, $0 // if TAIL == 0 { return }
JE end
tail_start: // Reset loop registers
MOVQ TAIL, LEN // Loop counter: LEN = TAIL
SHRQ $1, LEN // LEN = floor( TAIL / 2 )
JZ tail_one // if LEN == 0 { goto tail_one }
tail_two: // do { // y[i] += alpha * x[i], two elements per iteration.
MOVUPS (X_PTR)(IDX*8), X2 // X2 = x[i]
MULPD ALPHA, X2 // X2 *= a
ADDPD (Y_PTR)(IDX*8), X2 // X2 += y[i]
MOVUPS X2, (DST_PTR)(IDX*8) // y[i] = X2
ADDQ $2, IDX // i += 2
DECQ LEN
JNZ tail_two // } while --LEN > 0
ANDQ $1, TAIL
JZ end // if TAIL == 0 { goto end }
tail_one: // Single remaining element (reached only when TAIL is odd).
MOVSD (X_PTR)(IDX*8), X2 // X2 = x[i]
MULSD ALPHA, X2 // X2 *= a
ADDSD (Y_PTR)(IDX*8), X2 // X2 += y[i]
MOVSD X2, (DST_PTR)(IDX*8) // y[i] = X2
end:
RET

Some files were not shown because too many files have changed in this diff Show More