fixed dependencies

2024-10-24 15:46:01 +08:00
parent d16a5bd9c0
commit 1161e8d054
2005 changed files with 690883 additions and 0 deletions
--- a/vendor/gonum.org/v1/gonum/AUTHORS
+++ b/vendor/gonum.org/v1/gonum/AUTHORS
@@ -0,0 +1,132 @@
+# This is the official list of Gonum authors for copyright purposes.
+# This file is distinct from the CONTRIBUTORS files.
+# See the latter for an explanation.
+
+# Names should be added to this file as
+#	Name or Organization <email address>
+# The email address is not required for organizations.
+
+# Please keep the list sorted.
+
+Alexander Egurnov <alexander.egurnov@gmail.com>
+Andrei Blinnikov <goofinator@mail.ru>
+antichris <chris@u-d13.com>
+Bailey Lissington <lissington4@gmail.com>
+Bill Gray <wgray@gogray.com>
+Bill Noon <noon.bill@gmail.com>
+Brendan Tracey <tracey.brendan@gmail.com>
+Brent Pedersen <bpederse@gmail.com>
+Bulat Khasanov <afti@yandex.ru>
+Chad Kunde <kunde21@gmail.com>
+Chan Kwan Yin <sofe2038@gmail.com>
+Chih-Wei Chang <bert.cwchang@gmail.com>
+Chong-Yeol Nah <nahchongyeol@gmail.com>
+Chris Tessum <ctessum@gmail.com>
+Christophe Meessen <christophe.meessen@gmail.com>
+Christopher Waldon <christopher.waldon.dev@gmail.com>
+Clayton Northey <clayton.northey@gmail.com>
+Dan Kortschak <dan.kortschak@adelaide.edu.au> <dan@kortschak.io>
+Daniel Fireman <danielfireman@gmail.com>
+Dario Heinisch <dario.heinisch@gmail.com>
+David Kleiven <davidkleiven446@gmail.com>
+David Samborski <bloggingarrow@gmail.com>
+Davor Kapsa <davor.kapsa@gmail.com>
+DeepMind Technologies
+Delaney Gillilan <delaneygillilan@gmail.com>
+Dezmond Goff <goff.dezmond@gmail.com>
+Dong-hee Na <donghee.na92@gmail.com>
+Dustin Spicuzza <dustin@virtualroadside.com>
+Egon Elbre <egonelbre@gmail.com>
+Ekaterina Efimova <katerina.efimova@gmail.com>
+Eng Zer Jun <engzerjun@gmail.com>
+Ethan Burns <burns.ethan@gmail.com>
+Ethan Reesor <ethan.reesor@gmail.com>
+Evert Lammerts <evert.lammerts@gmail.com>
+Evgeny Savinov <notime.sea@gmail.com>
+Fabian Wickborn <fabian@wickborn.net>
+Facundo Gaich <facugaich@gmail.com>
+Fazlul Shahriar <fshahriar@gmail.com>
+Francesc Campoy <campoy@golang.org>
+Google Inc
+Gustaf Johansson <gustaf@pinon.se>
+Hossein Zolfi <hossein.zolfi@gmail.com>
+Iakov Davydov <iakov.davydov@unil.ch>
+Igor Mikushkin <igor.mikushkin@gmail.com>
+Iskander Sharipov <quasilyte@gmail.com>
+Jalem Raj Rohit <jrajrohit33@gmail.com>
+James Bell <james@stellentus.com>
+James Bowman <james.edward.bowman@gmail.com>
+James Holmes <32bitkid@gmail.com>
+Janne Snabb <snabb@epipe.com>
+Jeremy Atkinson <jchatkinson@gmail.com>
+Jes Cok <xigua67damn@gmail.com>
+Jinesi Yelizati <i63888888@163.com>
+Jonas Kahler <jonas@derkahler.de>
+Jonas Schulze <jonas.schulze@ovgu.de>
+Jonathan Bluett-Duncan <jbluettduncan@gmail.com>
+Jonathan J Lawlor <jonathan.lawlor@gmail.com>
+Jonathan Reiter <jonreiter@gmail.com>
+Jonathan Schroeder <jd.schroeder@gmail.com>
+Joost van Amersfoort <git@joo.st>
+Jordan Stoker <jordan_stoker@hotmail.com>
+Joseph Watson <jtwatson@linux-consulting.us>
+Josh Wilson <josh.craig.wilson@gmail.com>
+Julien Roland <juroland@gmail.com>
+Kai Trukenmüller <ktye78@gmail.com>
+Kent English <kent.english@gmail.com>
+Kevin C. Zimmerman <kevinczimmerman@gmail.com>
+Kirill Motkov <motkov.kirill@gmail.com>
+Konstantin Shaposhnikov <k.shaposhnikov@gmail.com>
+Leonid Kneller <recondite.matter@gmail.com>
+Lyron Winderbaum <lyron.winderbaum@student.adelaide.edu.au> <armadilloa16@gmail.com> <lyron.winderbaum@uwa.edu.au>
+Marco Leogrande <dark.knight.ita@gmail.com>
+Mark Canning <argusdusty@gmail.com>
+Mark Skilbeck <markskilbeck@gmail.com>
+Martin Diz <github@martindiz.com.ar>
+Matthew Connelly <matthew.b.connelly@gmail.com>
+Matthieu Di Mercurio <matthieu.dimercurio@gmail.com>
+Max Halford <maxhalford25@gmail.com>
+Maxim Sergeev <gudvinr@gmail.com>
+Microsoft Corporation
+MinJae Kwon <k239507@gmail.com>
+Nathan Edwards <etaoinshrdluwho@gmail.com>
+Nick Potts <nick@the-potts.com>
+Nils Wogatzky <odog@netcologne.de>
+Olivier Wulveryck <olivier.wulveryck@gmail.com>
+Or Rikon <rikonor@gmail.com>
+Patricio Whittingslow <graded.sp@gmail.com>
+Patrick DeVivo <patrick@tickgit.com>
+Pontus Melke <pontusmelke@gmail.com>
+Renee French
+Rishi Desai <desai.rishi1@gmail.com>
+Robin Eklind <r.eklind.87@gmail.com>
+Roger Welin <roger.welin@icloud.com>
+Rondall Jones <rejones7@gmail.com>
+Sam Zaydel <szaydel@gmail.com>
+Samuel Kelemen <Samuel@Kelemen.us>
+Saran Ahluwalia <ahlusar.ahluwalia@gmail.com>
+Scott Holden <scott@sshconnection.com>
+Scott Kiesel <kiesel.scott@gmail.com>
+Sebastien Binet <seb.binet@gmail.com>
+Shawn Smith <shawnpsmith@gmail.com>
+Sintela Ltd
+source{d} <hello@sourced.tech>
+Spencer Lyon <spencerlyon2@gmail.com>
+Steve McCoy <mccoyst@gmail.com>
+Taesu Pyo <pyotaesu@gmail.com>
+Takeshi Yoneda <cz.rk.t0415y.g@gmail.com>
+Tamir Hyman <hyman.tamir@gmail.com>
+The University of Adelaide
+The University of Minnesota
+The University of Washington
+Thomas Berg <tomfuture@gmail.com>
+Tobin Harding <me@tobin.cc>
+Valentin Deleplace <deleplace2015@gmail.com>
+Vincent Thiery <vjmthiery@gmail.com>
+Vladimír Chalupecký <vladimir.chalupecky@gmail.com>
+Will Tekulve <tekulve.will@gmail.com>
+Yasuhiro Matsumoto <mattn.jp@gmail.com>
+Yevgeniy Vahlis <evahlis@gmail.com>
+Yucheng Zhu <zyctc000@gmail.com>
+Yunomi <ynmtywn@gmail.com>
+Zoe Juozapaitis
--- a/vendor/gonum.org/v1/gonum/CONTRIBUTORS
+++ b/vendor/gonum.org/v1/gonum/CONTRIBUTORS
@@ -0,0 +1,135 @@
+# This is the official list of people who can contribute
+# (and typically have contributed) code to the Gonum
+# project.
+#
+# The AUTHORS file lists the copyright holders; this file
+# lists people.  For example, Google employees would be listed here
+# but not in AUTHORS, because Google would hold the copyright.
+#
+# When adding J Random Contributor's name to this file,
+# either J's name or J's organization's name should be
+# added to the AUTHORS file.
+#
+# Names should be added to this file like so:
+#     Name <email address>
+#
+# Please keep the list sorted.
+
+Alexander Egurnov <alexander.egurnov@gmail.com>
+Andrei Blinnikov <goofinator@mail.ru>
+Andrew Brampton <brampton@gmail.com>
+antichris <chris@u-d13.com>
+Bailey Lissington <lissington4@gmail.com>
+Bill Gray <wgray@gogray.com>
+Bill Noon <noon.bill@gmail.com>
+Brendan Tracey <tracey.brendan@gmail.com>
+Brent Pedersen <bpederse@gmail.com>
+Bulat Khasanov <afti@yandex.ru>
+Chad Kunde <kunde21@gmail.com>
+Chan Kwan Yin <sofe2038@gmail.com>
+Chih-Wei Chang <bert.cwchang@gmail.com>
+Chong-Yeol Nah <nahchongyeol@gmail.com>
+Chris Tessum <ctessum@gmail.com>
+Christophe Meessen <christophe.meessen@gmail.com>
+Christopher Waldon <christopher.waldon.dev@gmail.com>
+Clayton Northey <clayton.northey@gmail.com>
+Dan Kortschak <dan.kortschak@adelaide.edu.au> <dan@kortschak.io>
+Dan Lorenc <lorenc.d@gmail.com>
+Daniel Fireman <danielfireman@gmail.com>
+Dario Heinisch <dario.heinisch@gmail.com>
+David Kleiven <davidkleiven446@gmail.com>
+David Samborski <bloggingarrow@gmail.com>
+Davor Kapsa <davor.kapsa@gmail.com>
+Delaney Gillilan <delaneygillilan@gmail.com>
+Dezmond Goff <goff.dezmond@gmail.com>
+Dong-hee Na <donghee.na92@gmail.com>
+Dustin Spicuzza <dustin@virtualroadside.com>
+Egon Elbre <egonelbre@gmail.com>
+Ekaterina Efimova <katerina.efimova@gmail.com>
+Eng Zer Jun <engzerjun@gmail.com>
+Ethan Burns <burns.ethan@gmail.com>
+Ethan Reesor <ethan.reesor@gmail.com>
+Evert Lammerts <evert.lammerts@gmail.com>
+Evgeny Savinov <notime.sea@gmail.com>
+Fabian Wickborn <fabian@wickborn.net>
+Facundo Gaich <facugaich@gmail.com>
+Fazlul Shahriar <fshahriar@gmail.com>
+Francesc Campoy <campoy@golang.org>
+Gustaf Johansson <gustaf@pinon.se>
+Hossein Zolfi <hossein.zolfi@gmail.com>
+Iakov Davydov <iakov.davydov@unil.ch>
+Igor Mikushkin <igor.mikushkin@gmail.com>
+Iskander Sharipov <quasilyte@gmail.com>
+Jalem Raj Rohit <jrajrohit33@gmail.com>
+James Bell <james@stellentus.com>
+James Bowman <james.edward.bowman@gmail.com>
+James Holmes <32bitkid@gmail.com>
+Janne Snabb <snabb@epipe.com>
+Jeremy Atkinson <jchatkinson@gmail.com>
+Jes Cok <xigua67damn@gmail.com>
+Jinesi Yelizati <i63888888@163.com>
+Jon Richards <noj.richards@gmail.com>
+Jonas Kahler <jonas@derkahler.de>
+Jonas Schulze <jonas.schulze@ovgu.de>
+Jonathan Bluett-Duncan <jbluettduncan@gmail.com>
+Jonathan J Lawlor <jonathan.lawlor@gmail.com>
+Jonathan Reiter <jonreiter@gmail.com>
+Jonathan Schroeder <jd.schroeder@gmail.com>
+Joost van Amersfoort <git@joo.st>
+Jordan Stoker <jordan_stoker@hotmail.com>
+Joseph Watson <jtwatson@linux-consulting.us>
+Josh Wilson <josh.craig.wilson@gmail.com>
+Julien Roland <juroland@gmail.com>
+Kai Trukenmüller <ktye78@gmail.com>
+Kent English <kent.english@gmail.com>
+Kevin C. Zimmerman <kevinczimmerman@gmail.com>
+Kirill Motkov <motkov.kirill@gmail.com>
+Konstantin Shaposhnikov <k.shaposhnikov@gmail.com>
+Leonid Kneller <recondite.matter@gmail.com>
+Lyron Winderbaum <lyron.winderbaum@student.adelaide.edu.au> <armadilloa16@gmail.com> <lyron.winderbaum@uwa.edu.au>
+Marco Leogrande <dark.knight.ita@gmail.com>
+Mark Canning <argusdusty@gmail.com>
+Mark Skilbeck <markskilbeck@gmail.com>
+Martin Diz <github@martindiz.com.ar>
+Matthew Connelly <matthew.b.connelly@gmail.com>
+Matthieu Di Mercurio <matthieu.dimercurio@gmail.com>
+Max Halford <maxhalford25@gmail.com>
+Maxim Sergeev <gudvinr@gmail.com>
+MinJae Kwon <k239507@gmail.com>
+Nathan Edwards <etaoinshrdluwho@gmail.com>
+Nick Potts <nick@the-potts.com>
+Nils Wogatzky <odog@netcologne.de>
+Olivier Wulveryck <olivier.wulveryck@gmail.com>
+Or Rikon <rikonor@gmail.com>
+Patricio Whittingslow <graded.sp@gmail.com>
+Patrick DeVivo <patrick@tickgit.com>
+Pontus Melke <pontusmelke@gmail.com>
+Renee French
+Rishi Desai <desai.rishi1@gmail.com>
+Robin Eklind <r.eklind.87@gmail.com>
+Roger Welin <roger.welin@icloud.com>
+Roman Werpachowski <roman.werpachowski@gmail.com>
+Rondall Jones <rejones7@gmail.com>
+Sam Zaydel <szaydel@gmail.com>
+Samuel Kelemen <Samuel@Kelemen.us>
+Saran Ahluwalia <ahlusar.ahluwalia@gmail.com>
+Scott Holden <scott@sshconnection.com>
+Scott Kiesel <kiesel.scott@gmail.com>
+Sebastien Binet <seb.binet@gmail.com>
+Shawn Smith <shawnpsmith@gmail.com>
+Spencer Lyon <spencerlyon2@gmail.com>
+Steve McCoy <mccoyst@gmail.com>
+Taesu Pyo <pyotaesu@gmail.com>
+Takeshi Yoneda <cz.rk.t0415y.g@gmail.com>
+Tamir Hyman <hyman.tamir@gmail.com>
+Thomas Berg <tomfuture@gmail.com>
+Tobin Harding <me@tobin.cc>
+Valentin Deleplace <deleplace2015@gmail.com>
+Vincent Thiery <vjmthiery@gmail.com>
+Vladimír Chalupecký <vladimir.chalupecky@gmail.com>
+Will Tekulve <tekulve.will@gmail.com>
+Yasuhiro Matsumoto <mattn.jp@gmail.com>
+Yevgeniy Vahlis <evahlis@gmail.com>
+Yucheng Zhu <zyctc000@gmail.com>
+Yunomi <ynmtywn@gmail.com>
+Zoe Juozapaitis
--- a/vendor/gonum.org/v1/gonum/LICENSE
+++ b/vendor/gonum.org/v1/gonum/LICENSE
@@ -0,0 +1,23 @@
+Copyright ©2013 The Gonum Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of the Gonum project nor the names of its authors and
+      contributors may be used to endorse or promote products derived from this
+      software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/vendor/gonum.org/v1/gonum/blas/README.md
+++ b/vendor/gonum.org/v1/gonum/blas/README.md
@@ -0,0 +1,51 @@
+# Gonum BLAS
+
+[![go.dev reference](https://pkg.go.dev/badge/gonum.org/v1/gonum/blas)](https://pkg.go.dev/gonum.org/v1/gonum/blas)
+[![GoDoc](https://godocs.io/gonum.org/v1/gonum/blas?status.svg)](https://godocs.io/gonum.org/v1/gonum/blas)
+
+A collection of packages that provide BLAS functionality for the [Go programming
+language](http://golang.org).
+
+## Installation
+```sh
+  go get gonum.org/v1/gonum/blas/...
+```
+
+## Packages
+
+### blas
+
+Defines the [BLAS API](http://www.netlib.org/blas/blast-forum/cinterface.pdf), split into several
+interfaces.
+
+### blas/gonum
+
+Go implementation of the BLAS API (incomplete, implements the `float32` and `float64` API).
+
+### blas/blas64 and blas/blas32
+
+Wrappers for an implementation of the double (i.e., `float64`) and single (`float32`)
+precision real parts of the BLAS API.
+
+```Go
+package main
+
+import (
+	"fmt"
+
+	"gonum.org/v1/gonum/blas/blas64"
+)
+
+func main() {
+	v := blas64.Vector{Inc: 1, Data: []float64{1, 1, 1}}
+	v.N = len(v.Data)
+	fmt.Println("v has length:", blas64.Nrm2(v))
+}
+```
+
+### blas/cblas128 and blas/cblas64
+
+Wrappers for an implementation of the double (i.e., `complex128`) and single (`complex64`)
+precision complex parts of the BLAS API.
+
+Currently blas/cblas64 and blas/cblas128 require gonum.org/v1/netlib/blas.
--- a/vendor/gonum.org/v1/gonum/blas/blas.go
+++ b/vendor/gonum.org/v1/gonum/blas/blas.go
@@ -0,0 +1,283 @@
+// Copyright ©2013 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:generate ./conversions.bash
+
+package blas
+
+// Flag indicates the Givens transformation H matrix state held in SrotmParams and DrotmParams.
+type Flag int
+
+const (
+	Identity    Flag = -2 // H is the identity matrix; no rotation is needed.
+	Rescaling   Flag = -1 // H specifies rescaling.
+	OffDiagonal Flag = 0  // Off-diagonal elements of H are non-unit.
+	Diagonal    Flag = 1  // Diagonal elements of H are non-unit.
+)
+
+// SrotmParams contains Givens transformation parameters returned
+// by the Float32 Srotmg method and accepted by the Float32 Srotm method.
+type SrotmParams struct {
+	Flag
+	H [4]float32 // Column-major 2 by 2 matrix.
+}
+
+// DrotmParams contains Givens transformation parameters returned
+// by the Float64 Drotmg method and accepted by the Float64 Drotm method.
+type DrotmParams struct {
+	Flag
+	H [4]float64 // Column-major 2 by 2 matrix.
+}
+
+// Transpose specifies the transposition operation of a matrix.
+type Transpose byte
+
+const (
+	NoTrans   Transpose = 'N'
+	Trans     Transpose = 'T'
+	ConjTrans Transpose = 'C'
+)
+
+// Uplo specifies whether a matrix is upper or lower triangular.
+type Uplo byte
+
+const (
+	Upper Uplo = 'U'
+	Lower Uplo = 'L'
+	All   Uplo = 'A' // NOTE(review): not covered by the type comment; presumably selects the whole matrix — confirm.
+)
+
+// Diag specifies whether a matrix is unit triangular.
+type Diag byte
+
+const (
+	NonUnit Diag = 'N'
+	Unit    Diag = 'U'
+)
+
+// Side specifies from which side a multiplication operation is performed.
+type Side byte
+
+const (
+	Left  Side = 'L'
+	Right Side = 'R'
+)
+
+// Float32 implements the single precision real BLAS routines.
+type Float32 interface {
+	Float32Level1
+	Float32Level2
+	Float32Level3
+}
+
+// Float32Level1 implements the single precision real BLAS Level 1 routines.
+type Float32Level1 interface {
+	Sdsdot(n int, alpha float32, x []float32, incX int, y []float32, incY int) float32
+	Dsdot(n int, x []float32, incX int, y []float32, incY int) float64
+	Sdot(n int, x []float32, incX int, y []float32, incY int) float32
+	Snrm2(n int, x []float32, incX int) float32
+	Sasum(n int, x []float32, incX int) float32
+	Isamax(n int, x []float32, incX int) int
+	Sswap(n int, x []float32, incX int, y []float32, incY int)
+	Scopy(n int, x []float32, incX int, y []float32, incY int)
+	Saxpy(n int, alpha float32, x []float32, incX int, y []float32, incY int)
+	Srotg(a, b float32) (c, s, r, z float32)
+	Srotmg(d1, d2, b1, b2 float32) (p SrotmParams, rd1, rd2, rb1 float32)
+	Srot(n int, x []float32, incX int, y []float32, incY int, c, s float32)
+	Srotm(n int, x []float32, incX int, y []float32, incY int, p SrotmParams)
+	Sscal(n int, alpha float32, x []float32, incX int)
+}
+
+// Float32Level2 implements the single precision real BLAS Level 2 routines.
+type Float32Level2 interface {
+	Sgemv(tA Transpose, m, n int, alpha float32, a []float32, lda int, x []float32, incX int, beta float32, y []float32, incY int)
+	Sgbmv(tA Transpose, m, n, kL, kU int, alpha float32, a []float32, lda int, x []float32, incX int, beta float32, y []float32, incY int)
+	Strmv(ul Uplo, tA Transpose, d Diag, n int, a []float32, lda int, x []float32, incX int)
+	Stbmv(ul Uplo, tA Transpose, d Diag, n, k int, a []float32, lda int, x []float32, incX int)
+	Stpmv(ul Uplo, tA Transpose, d Diag, n int, ap []float32, x []float32, incX int)
+	Strsv(ul Uplo, tA Transpose, d Diag, n int, a []float32, lda int, x []float32, incX int)
+	Stbsv(ul Uplo, tA Transpose, d Diag, n, k int, a []float32, lda int, x []float32, incX int)
+	Stpsv(ul Uplo, tA Transpose, d Diag, n int, ap []float32, x []float32, incX int)
+	Ssymv(ul Uplo, n int, alpha float32, a []float32, lda int, x []float32, incX int, beta float32, y []float32, incY int)
+	Ssbmv(ul Uplo, n, k int, alpha float32, a []float32, lda int, x []float32, incX int, beta float32, y []float32, incY int)
+	Sspmv(ul Uplo, n int, alpha float32, ap []float32, x []float32, incX int, beta float32, y []float32, incY int)
+	Sger(m, n int, alpha float32, x []float32, incX int, y []float32, incY int, a []float32, lda int)
+	Ssyr(ul Uplo, n int, alpha float32, x []float32, incX int, a []float32, lda int)
+	Sspr(ul Uplo, n int, alpha float32, x []float32, incX int, ap []float32)
+	Ssyr2(ul Uplo, n int, alpha float32, x []float32, incX int, y []float32, incY int, a []float32, lda int)
+	Sspr2(ul Uplo, n int, alpha float32, x []float32, incX int, y []float32, incY int, a []float32)
+}
+
+// Float32Level3 implements the single precision real BLAS Level 3 routines.
+type Float32Level3 interface {
+	Sgemm(tA, tB Transpose, m, n, k int, alpha float32, a []float32, lda int, b []float32, ldb int, beta float32, c []float32, ldc int)
+	Ssymm(s Side, ul Uplo, m, n int, alpha float32, a []float32, lda int, b []float32, ldb int, beta float32, c []float32, ldc int)
+	Ssyrk(ul Uplo, t Transpose, n, k int, alpha float32, a []float32, lda int, beta float32, c []float32, ldc int)
+	Ssyr2k(ul Uplo, t Transpose, n, k int, alpha float32, a []float32, lda int, b []float32, ldb int, beta float32, c []float32, ldc int)
+	Strmm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha float32, a []float32, lda int, b []float32, ldb int)
+	Strsm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha float32, a []float32, lda int, b []float32, ldb int)
+}
+
+// Float64 implements the double precision real BLAS routines.
+type Float64 interface {
+	Float64Level1
+	Float64Level2
+	Float64Level3
+}
+
+// Float64Level1 implements the double precision real BLAS Level 1 routines.
+type Float64Level1 interface {
+	Ddot(n int, x []float64, incX int, y []float64, incY int) float64
+	Dnrm2(n int, x []float64, incX int) float64
+	Dasum(n int, x []float64, incX int) float64
+	Idamax(n int, x []float64, incX int) int
+	Dswap(n int, x []float64, incX int, y []float64, incY int)
+	Dcopy(n int, x []float64, incX int, y []float64, incY int)
+	Daxpy(n int, alpha float64, x []float64, incX int, y []float64, incY int)
+	Drotg(a, b float64) (c, s, r, z float64)
+	Drotmg(d1, d2, b1, b2 float64) (p DrotmParams, rd1, rd2, rb1 float64)
+	Drot(n int, x []float64, incX int, y []float64, incY int, c float64, s float64)
+	Drotm(n int, x []float64, incX int, y []float64, incY int, p DrotmParams)
+	Dscal(n int, alpha float64, x []float64, incX int)
+}
+
+// Float64Level2 implements the double precision real BLAS Level 2 routines.
+type Float64Level2 interface {
+	Dgemv(tA Transpose, m, n int, alpha float64, a []float64, lda int, x []float64, incX int, beta float64, y []float64, incY int)
+	Dgbmv(tA Transpose, m, n, kL, kU int, alpha float64, a []float64, lda int, x []float64, incX int, beta float64, y []float64, incY int)
+	Dtrmv(ul Uplo, tA Transpose, d Diag, n int, a []float64, lda int, x []float64, incX int)
+	Dtbmv(ul Uplo, tA Transpose, d Diag, n, k int, a []float64, lda int, x []float64, incX int)
+	Dtpmv(ul Uplo, tA Transpose, d Diag, n int, ap []float64, x []float64, incX int)
+	Dtrsv(ul Uplo, tA Transpose, d Diag, n int, a []float64, lda int, x []float64, incX int)
+	Dtbsv(ul Uplo, tA Transpose, d Diag, n, k int, a []float64, lda int, x []float64, incX int)
+	Dtpsv(ul Uplo, tA Transpose, d Diag, n int, ap []float64, x []float64, incX int)
+	Dsymv(ul Uplo, n int, alpha float64, a []float64, lda int, x []float64, incX int, beta float64, y []float64, incY int)
+	Dsbmv(ul Uplo, n, k int, alpha float64, a []float64, lda int, x []float64, incX int, beta float64, y []float64, incY int)
+	Dspmv(ul Uplo, n int, alpha float64, ap []float64, x []float64, incX int, beta float64, y []float64, incY int)
+	Dger(m, n int, alpha float64, x []float64, incX int, y []float64, incY int, a []float64, lda int)
+	Dsyr(ul Uplo, n int, alpha float64, x []float64, incX int, a []float64, lda int)
+	Dspr(ul Uplo, n int, alpha float64, x []float64, incX int, ap []float64)
+	Dsyr2(ul Uplo, n int, alpha float64, x []float64, incX int, y []float64, incY int, a []float64, lda int)
+	Dspr2(ul Uplo, n int, alpha float64, x []float64, incX int, y []float64, incY int, a []float64)
+}
+
+// Float64Level3 implements the double precision real BLAS Level 3 routines.
+type Float64Level3 interface {
+	Dgemm(tA, tB Transpose, m, n, k int, alpha float64, a []float64, lda int, b []float64, ldb int, beta float64, c []float64, ldc int)
+	Dsymm(s Side, ul Uplo, m, n int, alpha float64, a []float64, lda int, b []float64, ldb int, beta float64, c []float64, ldc int)
+	Dsyrk(ul Uplo, t Transpose, n, k int, alpha float64, a []float64, lda int, beta float64, c []float64, ldc int)
+	Dsyr2k(ul Uplo, t Transpose, n, k int, alpha float64, a []float64, lda int, b []float64, ldb int, beta float64, c []float64, ldc int)
+	Dtrmm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha float64, a []float64, lda int, b []float64, ldb int)
+	Dtrsm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha float64, a []float64, lda int, b []float64, ldb int)
+}
+
+// Complex64 implements the single precision complex BLAS routines.
+type Complex64 interface {
+	Complex64Level1
+	Complex64Level2
+	Complex64Level3
+}
+
+// Complex64Level1 implements the single precision complex BLAS Level 1 routines.
+type Complex64Level1 interface {
+	Cdotu(n int, x []complex64, incX int, y []complex64, incY int) (dotu complex64)
+	Cdotc(n int, x []complex64, incX int, y []complex64, incY int) (dotc complex64)
+	Scnrm2(n int, x []complex64, incX int) float32
+	Scasum(n int, x []complex64, incX int) float32
+	Icamax(n int, x []complex64, incX int) int
+	Cswap(n int, x []complex64, incX int, y []complex64, incY int)
+	Ccopy(n int, x []complex64, incX int, y []complex64, incY int)
+	Caxpy(n int, alpha complex64, x []complex64, incX int, y []complex64, incY int)
+	Cscal(n int, alpha complex64, x []complex64, incX int)
+	Csscal(n int, alpha float32, x []complex64, incX int)
+}
+
+// Complex64Level2 implements the single precision complex BLAS Level 2 routines.
+type Complex64Level2 interface {
+	Cgemv(tA Transpose, m, n int, alpha complex64, a []complex64, lda int, x []complex64, incX int, beta complex64, y []complex64, incY int)
+	Cgbmv(tA Transpose, m, n, kL, kU int, alpha complex64, a []complex64, lda int, x []complex64, incX int, beta complex64, y []complex64, incY int)
+	Ctrmv(ul Uplo, tA Transpose, d Diag, n int, a []complex64, lda int, x []complex64, incX int)
+	Ctbmv(ul Uplo, tA Transpose, d Diag, n, k int, a []complex64, lda int, x []complex64, incX int)
+	Ctpmv(ul Uplo, tA Transpose, d Diag, n int, ap []complex64, x []complex64, incX int)
+	Ctrsv(ul Uplo, tA Transpose, d Diag, n int, a []complex64, lda int, x []complex64, incX int)
+	Ctbsv(ul Uplo, tA Transpose, d Diag, n, k int, a []complex64, lda int, x []complex64, incX int)
+	Ctpsv(ul Uplo, tA Transpose, d Diag, n int, ap []complex64, x []complex64, incX int)
+	Chemv(ul Uplo, n int, alpha complex64, a []complex64, lda int, x []complex64, incX int, beta complex64, y []complex64, incY int)
+	Chbmv(ul Uplo, n, k int, alpha complex64, a []complex64, lda int, x []complex64, incX int, beta complex64, y []complex64, incY int)
+	Chpmv(ul Uplo, n int, alpha complex64, ap []complex64, x []complex64, incX int, beta complex64, y []complex64, incY int)
+	Cgeru(m, n int, alpha complex64, x []complex64, incX int, y []complex64, incY int, a []complex64, lda int)
+	Cgerc(m, n int, alpha complex64, x []complex64, incX int, y []complex64, incY int, a []complex64, lda int)
+	Cher(ul Uplo, n int, alpha float32, x []complex64, incX int, a []complex64, lda int)
+	Chpr(ul Uplo, n int, alpha float32, x []complex64, incX int, a []complex64)
+	Cher2(ul Uplo, n int, alpha complex64, x []complex64, incX int, y []complex64, incY int, a []complex64, lda int)
+	Chpr2(ul Uplo, n int, alpha complex64, x []complex64, incX int, y []complex64, incY int, ap []complex64)
+}
+
+// Complex64Level3 implements the single precision complex BLAS Level 3 routines.
+type Complex64Level3 interface {
+	Cgemm(tA, tB Transpose, m, n, k int, alpha complex64, a []complex64, lda int, b []complex64, ldb int, beta complex64, c []complex64, ldc int)
+	Csymm(s Side, ul Uplo, m, n int, alpha complex64, a []complex64, lda int, b []complex64, ldb int, beta complex64, c []complex64, ldc int)
+	Csyrk(ul Uplo, t Transpose, n, k int, alpha complex64, a []complex64, lda int, beta complex64, c []complex64, ldc int)
+	Csyr2k(ul Uplo, t Transpose, n, k int, alpha complex64, a []complex64, lda int, b []complex64, ldb int, beta complex64, c []complex64, ldc int)
+	Ctrmm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha complex64, a []complex64, lda int, b []complex64, ldb int)
+	Ctrsm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha complex64, a []complex64, lda int, b []complex64, ldb int)
+	Chemm(s Side, ul Uplo, m, n int, alpha complex64, a []complex64, lda int, b []complex64, ldb int, beta complex64, c []complex64, ldc int)
+	Cherk(ul Uplo, t Transpose, n, k int, alpha float32, a []complex64, lda int, beta float32, c []complex64, ldc int)
+	Cher2k(ul Uplo, t Transpose, n, k int, alpha complex64, a []complex64, lda int, b []complex64, ldb int, beta float32, c []complex64, ldc int)
+}
+
+// Complex128 implements the double precision complex BLAS routines.
+type Complex128 interface {
+	Complex128Level1
+	Complex128Level2
+	Complex128Level3
+}
+
+// Complex128Level1 implements the double precision complex BLAS Level 1 routines.
+type Complex128Level1 interface {
+	Zdotu(n int, x []complex128, incX int, y []complex128, incY int) (dotu complex128)
+	Zdotc(n int, x []complex128, incX int, y []complex128, incY int) (dotc complex128)
+	Dznrm2(n int, x []complex128, incX int) float64
+	Dzasum(n int, x []complex128, incX int) float64
+	Izamax(n int, x []complex128, incX int) int
+	Zswap(n int, x []complex128, incX int, y []complex128, incY int)
+	Zcopy(n int, x []complex128, incX int, y []complex128, incY int)
+	Zaxpy(n int, alpha complex128, x []complex128, incX int, y []complex128, incY int)
+	Zscal(n int, alpha complex128, x []complex128, incX int)
+	Zdscal(n int, alpha float64, x []complex128, incX int)
+}
+
+// Complex128Level2 implements the double precision complex BLAS Level 2 routines.
+type Complex128Level2 interface {
+	Zgemv(tA Transpose, m, n int, alpha complex128, a []complex128, lda int, x []complex128, incX int, beta complex128, y []complex128, incY int)
+	Zgbmv(tA Transpose, m, n int, kL int, kU int, alpha complex128, a []complex128, lda int, x []complex128, incX int, beta complex128, y []complex128, incY int)
+	Ztrmv(ul Uplo, tA Transpose, d Diag, n int, a []complex128, lda int, x []complex128, incX int)
+	Ztbmv(ul Uplo, tA Transpose, d Diag, n, k int, a []complex128, lda int, x []complex128, incX int)
+	Ztpmv(ul Uplo, tA Transpose, d Diag, n int, ap []complex128, x []complex128, incX int)
+	Ztrsv(ul Uplo, tA Transpose, d Diag, n int, a []complex128, lda int, x []complex128, incX int)
+	Ztbsv(ul Uplo, tA Transpose, d Diag, n, k int, a []complex128, lda int, x []complex128, incX int)
+	Ztpsv(ul Uplo, tA Transpose, d Diag, n int, ap []complex128, x []complex128, incX int)
+	Zhemv(ul Uplo, n int, alpha complex128, a []complex128, lda int, x []complex128, incX int, beta complex128, y []complex128, incY int)
+	Zhbmv(ul Uplo, n, k int, alpha complex128, a []complex128, lda int, x []complex128, incX int, beta complex128, y []complex128, incY int)
+	Zhpmv(ul Uplo, n int, alpha complex128, ap []complex128, x []complex128, incX int, beta complex128, y []complex128, incY int)
+	Zgeru(m, n int, alpha complex128, x []complex128, incX int, y []complex128, incY int, a []complex128, lda int)
+	Zgerc(m, n int, alpha complex128, x []complex128, incX int, y []complex128, incY int, a []complex128, lda int)
+	Zher(ul Uplo, n int, alpha float64, x []complex128, incX int, a []complex128, lda int)
+	Zhpr(ul Uplo, n int, alpha float64, x []complex128, incX int, a []complex128)
+	Zher2(ul Uplo, n int, alpha complex128, x []complex128, incX int, y []complex128, incY int, a []complex128, lda int)
+	Zhpr2(ul Uplo, n int, alpha complex128, x []complex128, incX int, y []complex128, incY int, ap []complex128)
+}
+
+// Complex128Level3 implements the double precision complex BLAS Level 3 routines.
+type Complex128Level3 interface {
+	Zgemm(tA, tB Transpose, m, n, k int, alpha complex128, a []complex128, lda int, b []complex128, ldb int, beta complex128, c []complex128, ldc int)
+	Zsymm(s Side, ul Uplo, m, n int, alpha complex128, a []complex128, lda int, b []complex128, ldb int, beta complex128, c []complex128, ldc int)
+	Zsyrk(ul Uplo, t Transpose, n, k int, alpha complex128, a []complex128, lda int, beta complex128, c []complex128, ldc int)
+	Zsyr2k(ul Uplo, t Transpose, n, k int, alpha complex128, a []complex128, lda int, b []complex128, ldb int, beta complex128, c []complex128, ldc int)
+	Ztrmm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha complex128, a []complex128, lda int, b []complex128, ldb int)
+	Ztrsm(s Side, ul Uplo, tA Transpose, d Diag, m, n int, alpha complex128, a []complex128, lda int, b []complex128, ldb int)
+	Zhemm(s Side, ul Uplo, m, n int, alpha complex128, a []complex128, lda int, b []complex128, ldb int, beta complex128, c []complex128, ldc int)
+	Zherk(ul Uplo, t Transpose, n, k int, alpha float64, a []complex128, lda int, beta float64, c []complex128, ldc int)
+	Zher2k(ul Uplo, t Transpose, n, k int, alpha complex128, a []complex128, lda int, b []complex128, ldb int, beta float64, c []complex128, ldc int)
+}
--- a/vendor/gonum.org/v1/gonum/blas/blas64/blas64.go
+++ b/vendor/gonum.org/v1/gonum/blas/blas64/blas64.go
@@ -0,0 +1,533 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package blas64
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/gonum"
+)
+
+// blas64 is the package-level float64 BLAS backend that every wrapper in this
+// package dispatches to. It starts out as gonum.Implementation{} and is
+// replaced by Use, which assigns it without any locking.
+var blas64 blas.Float64 = gonum.Implementation{}
+
+// Use installs b as the float64 BLAS backend. Every wrapper in this package
+// that is called after Use returns is dispatched to b. Until Use is called
+// the backend is gonum.org/v1/gonum/blas/gonum.Implementation.
+func Use(b blas.Float64) { blas64 = b }
+
+// Implementation reports the float64 BLAS backend currently in use.
+//
+// Callers that need finer control over parameters than the wrappers in this
+// package offer can invoke the returned implementation directly.
+func Implementation() blas.Float64 {
+	current := blas64
+	return current
+}
+
+// Vector represents a vector with an associated element increment.
+type Vector struct {
+	// N is the number of elements. The Level 1 wrappers pass it as the
+	// BLAS n argument and require it to match between paired vectors.
+	N    int
+	// Data is the backing storage for the elements.
+	Data []float64
+	// Inc is the increment between successive elements in Data. Nrm2, Asum,
+	// Iamax and Scal panic if it is negative.
+	Inc  int
+}
+
+// General represents a matrix using the conventional storage scheme.
+type General struct {
+	// Rows and Cols are the matrix dimensions.
+	Rows, Cols int
+	// Data holds the elements row by row: the conversion routines in this
+	// package read element (i, j) from Data[i*Stride+j].
+	Data       []float64
+	// Stride is the distance in Data between the starts of consecutive rows.
+	Stride     int
+}
+
+// Band represents a band matrix using the band storage scheme.
+type Band struct {
+	// Rows and Cols are the dimensions of the full matrix.
+	Rows, Cols int
+	// KL and KU are the numbers of sub-diagonals and super-diagonals held
+	// in the band.
+	KL, KU     int
+	// Data holds the band row by row: the conversion routines read element
+	// (i, j), for max(0, i-KL) <= j < min(i+KU+1, Cols), from
+	// Data[i*Stride+KL+j-i].
+	Data       []float64
+	// Stride is the distance in Data between the starts of consecutive rows.
+	// The conversion routines require Stride >= KL+KU+1.
+	Stride     int
+}
+
+// Triangular represents a triangular matrix using the conventional storage scheme.
+type Triangular struct {
+	// Uplo selects the triangle that is referenced. The conversion routines
+	// accept blas.Upper, blas.Lower and blas.All.
+	Uplo   blas.Uplo
+	// Diag is forwarded unchanged as the diag argument of the BLAS routines
+	// (presumably unit versus non-unit diagonal — see the blas package).
+	Diag   blas.Diag
+	// N is the order of the matrix (it is N×N).
+	N      int
+	// Data holds the elements row by row: the conversion routines read
+	// element (i, j) from Data[i*Stride+j].
+	Data   []float64
+	// Stride is the distance in Data between the starts of consecutive rows.
+	Stride int
+}
+
+// TriangularBand represents a triangular matrix using the band storage scheme.
+type TriangularBand struct {
+	// Uplo selects the triangle that is stored. The conversion routines
+	// accept only blas.Upper and blas.Lower.
+	Uplo   blas.Uplo
+	// Diag is forwarded unchanged as the diag argument of the BLAS routines
+	// (presumably unit versus non-unit diagonal — see the blas package).
+	Diag   blas.Diag
+	// N is the order of the matrix. K is the bandwidth: the conversion
+	// routines treat it as the number of super-diagonals when Uplo is
+	// blas.Upper and of sub-diagonals when Uplo is blas.Lower.
+	N, K   int
+	// Data is the backing storage for the band.
+	Data   []float64
+	// Stride is the distance in Data between consecutive rows of the band.
+	// The conversion routines require Stride >= K+1.
+	Stride int
+}
+
+// TriangularPacked represents a triangular matrix using the packed storage scheme.
+type TriangularPacked struct {
+	// Uplo selects the triangle that is stored in Data.
+	Uplo blas.Uplo
+	// Diag is forwarded unchanged as the diag argument of the BLAS routines
+	// (presumably unit versus non-unit diagonal — see the blas package).
+	Diag blas.Diag
+	// N is the order of the matrix.
+	N    int
+	// Data holds the packed triangle. There is no Stride field; the packed
+	// routines (Tpmv, Tpsv) receive only N and Data.
+	Data []float64
+}
+
+// Symmetric represents a symmetric matrix using the conventional storage scheme.
+type Symmetric struct {
+	// Uplo selects the triangle of Data that is referenced. The conversion
+	// routines accept only blas.Upper and blas.Lower.
+	Uplo   blas.Uplo
+	// N is the order of the matrix (it is N×N).
+	N      int
+	// Data holds the elements row by row: the conversion routines read
+	// element (i, j) of the selected triangle from Data[i*Stride+j].
+	Data   []float64
+	// Stride is the distance in Data between the starts of consecutive rows.
+	Stride int
+}
+
+// SymmetricBand represents a symmetric matrix using the band storage scheme.
+type SymmetricBand struct {
+	// Uplo selects the triangle that is stored. The conversion routines
+	// accept only blas.Upper and blas.Lower.
+	Uplo   blas.Uplo
+	// N is the order of the matrix. K is the bandwidth: the conversion
+	// routines treat it as the number of super-diagonals when Uplo is
+	// blas.Upper and of sub-diagonals when Uplo is blas.Lower.
+	N, K   int
+	// Data is the backing storage for the band.
+	Data   []float64
+	// Stride is the distance in Data between consecutive rows of the band.
+	// The conversion routines require Stride >= K+1.
+	Stride int
+}
+
+// SymmetricPacked represents a symmetric matrix using the packed storage scheme.
+type SymmetricPacked struct {
+	// Uplo selects the triangle that is stored in Data.
+	Uplo blas.Uplo
+	// N is the order of the matrix.
+	N    int
+	// Data holds the packed triangle. There is no Stride field; the packed
+	// routines (Spmv, Spr, Spr2) receive only N and Data.
+	Data []float64
+}
+
+// Level 1
+
+// Panic messages shared by the Level 1 wrappers.
+
+// negInc is raised when a vector has a negative increment.
+const negInc = "blas64: negative vector increment"
+
+// badLength is raised when two paired vectors differ in length.
+const badLength = "blas64: vector length mismatch"
+
+// Dot returns the inner product of x and y:
+//
+//	\sum_i x[i]*y[i].
+//
+// Dot panics if x.N and y.N differ.
+func Dot(x, y Vector) float64 {
+	n := x.N
+	if y.N != n {
+		panic(badLength)
+	}
+	return blas64.Ddot(n, x.Data, x.Inc, y.Data, y.Inc)
+}
+
+// Nrm2 returns the Euclidean norm of x:
+//
+//	sqrt(\sum_i x[i]*x[i]).
+//
+// Nrm2 panics if x.Inc is negative.
+func Nrm2(x Vector) float64 {
+	if x.Inc >= 0 {
+		return blas64.Dnrm2(x.N, x.Data, x.Inc)
+	}
+	panic(negInc)
+}
+
+// Asum returns the sum of the absolute values of the elements of x:
+//
+//	\sum_i |x[i]|.
+//
+// Asum panics if x.Inc is negative.
+func Asum(x Vector) float64 {
+	inc := x.Inc
+	if inc < 0 {
+		panic(negInc)
+	}
+	return blas64.Dasum(x.N, x.Data, inc)
+}
+
+// Iamax returns the index of an element of x that has the largest absolute
+// value. When several elements tie, the smallest such index is returned.
+// Iamax returns -1 if n == 0.
+//
+// Iamax panics if x.Inc is negative.
+func Iamax(x Vector) int {
+	if x.Inc >= 0 {
+		return blas64.Idamax(x.N, x.Data, x.Inc)
+	}
+	panic(negInc)
+}
+
+// Swap exchanges the contents of x and y element by element:
+//
+//	x[i], y[i] = y[i], x[i] for all i.
+//
+// Swap panics if x.N and y.N differ.
+func Swap(x, y Vector) {
+	if n := x.N; n == y.N {
+		blas64.Dswap(n, x.Data, x.Inc, y.Data, y.Inc)
+		return
+	}
+	panic(badLength)
+}
+
+// Copy writes the elements of x into y:
+//
+//	y[i] = x[i] for all i.
+//
+// Copy panics if x.N and y.N differ.
+func Copy(x, y Vector) {
+	n := x.N
+	if n != y.N {
+		panic(badLength)
+	}
+	blas64.Dcopy(n, x.Data, x.Inc, y.Data, y.Inc)
+}
+
+// Axpy accumulates alpha times x into y:
+//
+//	y[i] += alpha*x[i] for all i.
+//
+// Axpy panics if x.N and y.N differ.
+func Axpy(alpha float64, x, y Vector) {
+	if n := x.N; n == y.N {
+		blas64.Daxpy(n, alpha, x.Data, x.Inc, y.Data, y.Inc)
+		return
+	}
+	panic(badLength)
+}
+
+// Rotg computes the parameters of a Givens plane rotation so that
+//
+//	⎡ c s⎤   ⎡a⎤   ⎡r⎤
+//	⎣-s c⎦ * ⎣b⎦ = ⎣0⎦
+//
+// where a and b are the Cartesian coordinates of a given point.
+// c, s, and r are defined as
+//
+//	r = ±Sqrt(a^2 + b^2),
+//	c = a/r, the cosine of the rotation angle,
+//	s = b/r, the sine of the rotation angle,
+//
+// and z is defined such that
+//
+//	if |a| > |b|,        z = s,
+//	otherwise if c != 0, z = 1/c,
+//	otherwise            z = 1.
+func Rotg(a, b float64) (c, s, r, z float64) {
+	return blas64.Drotg(a, b)
+}
+
+// Rotmg constructs a modified Givens rotation by forwarding its arguments to
+// the backend's Drotmg and returning its results unchanged. See
+// http://www.netlib.org/lapack/explore-html/df/deb/drotmg_8f.html
+// for the definition of the parameters and results.
+func Rotmg(d1, d2, b1, b2 float64) (p blas.DrotmParams, rd1, rd2, rb1 float64) {
+	p, rd1, rd2, rb1 = blas64.Drotmg(d1, d2, b1, b2)
+	return p, rd1, rd2, rb1
+}
+
+// Rot applies a plane rotation to the n points whose coordinates are held in
+// x and y:
+//
+//	x[i] =  c*x[i] + s*y[i],
+//	y[i] = -s*x[i] + c*y[i], for all i.
+//
+// Rot panics if x.N and y.N differ.
+func Rot(x, y Vector, c, s float64) {
+	if n := x.N; n == y.N {
+		blas64.Drot(n, x.Data, x.Inc, y.Data, y.Inc, c, s)
+		return
+	}
+	panic(badLength)
+}
+
+// Rotm applies the modified Givens rotation described by p to the n points
+// whose coordinates are held in x and y.
+//
+// Rotm panics if x.N and y.N differ.
+func Rotm(x, y Vector, p blas.DrotmParams) {
+	n := x.N
+	if n != y.N {
+		panic(badLength)
+	}
+	blas64.Drotm(n, x.Data, x.Inc, y.Data, y.Inc, p)
+}
+
+// Scal multiplies every element of x by alpha:
+//
+//	x[i] *= alpha for all i.
+//
+// Scal panics if x.Inc is negative.
+func Scal(alpha float64, x Vector) {
+	if x.Inc >= 0 {
+		blas64.Dscal(x.N, alpha, x.Data, x.Inc)
+		return
+	}
+	panic(negInc)
+}
+
+// Level 2
+
+// Gemv performs the general matrix-vector update
+//
+//	y = alpha * A * x + beta * y   if t == blas.NoTrans,
+//	y = alpha * Aᵀ * x + beta * y  if t == blas.Trans or blas.ConjTrans,
+//
+// where A is an m×n dense matrix, x and y are vectors, and alpha and beta
+// are scalars. The dimensions m and n are taken from a.Rows and a.Cols.
+func Gemv(t blas.Transpose, alpha float64, a General, x Vector, beta float64, y Vector) {
+	m, n := a.Rows, a.Cols
+	blas64.Dgemv(t, m, n, alpha, a.Data, a.Stride, x.Data, x.Inc, beta, y.Data, y.Inc)
+}
+
+// Gbmv performs the band matrix-vector update
+//
+//	y = alpha * A * x + beta * y   if t == blas.NoTrans,
+//	y = alpha * Aᵀ * x + beta * y  if t == blas.Trans or blas.ConjTrans,
+//
+// where A is an m×n band matrix, x and y are vectors, and alpha and beta are
+// scalars. The dimensions and bandwidths are taken from the fields of a.
+func Gbmv(t blas.Transpose, alpha float64, a Band, x Vector, beta float64, y Vector) {
+	blas64.Dgbmv(
+		t, a.Rows, a.Cols, a.KL, a.KU,
+		alpha, a.Data, a.Stride,
+		x.Data, x.Inc,
+		beta, y.Data, y.Inc,
+	)
+}
+
+// Trmv overwrites x with a triangular matrix-vector product:
+//
+//	x = A * x   if t == blas.NoTrans,
+//	x = Aᵀ * x  if t == blas.Trans or blas.ConjTrans,
+//
+// where A is an n×n triangular matrix, and x is a vector.
+func Trmv(t blas.Transpose, a Triangular, x Vector) {
+	uplo, diag, n := a.Uplo, a.Diag, a.N
+	blas64.Dtrmv(uplo, t, diag, n, a.Data, a.Stride, x.Data, x.Inc)
+}
+
+// Tbmv overwrites x with a triangular band matrix-vector product:
+//
+//	x = A * x   if t == blas.NoTrans,
+//	x = Aᵀ * x  if t == blas.Trans or blas.ConjTrans,
+//
+// where A is an n×n triangular band matrix, and x is a vector.
+func Tbmv(t blas.Transpose, a TriangularBand, x Vector) {
+	blas64.Dtbmv(
+		a.Uplo, t, a.Diag,
+		a.N, a.K,
+		a.Data, a.Stride,
+		x.Data, x.Inc,
+	)
+}
+
+// Tpmv overwrites x with a packed triangular matrix-vector product:
+//
+//	x = A * x   if t == blas.NoTrans,
+//	x = Aᵀ * x  if t == blas.Trans or blas.ConjTrans,
+//
+// where A is an n×n triangular matrix in packed format, and x is a vector.
+func Tpmv(t blas.Transpose, a TriangularPacked, x Vector) {
+	uplo, diag, n := a.Uplo, a.Diag, a.N
+	blas64.Dtpmv(uplo, t, diag, n, a.Data, x.Data, x.Inc)
+}
+
+// Trsv solves one of the triangular systems
+//
+//	A * x = b   if t == blas.NoTrans,
+//	Aᵀ * x = b  if t == blas.Trans or blas.ConjTrans,
+//
+// where A is an n×n triangular matrix, and x and b are vectors.
+//
+// On entry x holds the right-hand side b; on return it holds the solution,
+// which is written in place.
+//
+// The routine does not test for singularity or near-singularity. Such tests
+// must be carried out by the caller beforehand.
+func Trsv(t blas.Transpose, a Triangular, x Vector) {
+	blas64.Dtrsv(
+		a.Uplo, t, a.Diag,
+		a.N, a.Data, a.Stride,
+		x.Data, x.Inc,
+	)
+}
+
+// Tbsv solves one of the triangular band systems
+//
+//	A * x = b   if t == blas.NoTrans,
+//	Aᵀ * x = b  if t == blas.Trans or blas.ConjTrans,
+//
+// where A is an n×n triangular band matrix, and x and b are vectors.
+//
+// On entry x holds the right-hand side b; on return it holds the solution,
+// which is written in place.
+//
+// The routine does not test for singularity or near-singularity. Such tests
+// must be carried out by the caller beforehand.
+func Tbsv(t blas.Transpose, a TriangularBand, x Vector) {
+	n, k := a.N, a.K
+	blas64.Dtbsv(a.Uplo, t, a.Diag, n, k, a.Data, a.Stride, x.Data, x.Inc)
+}
+
+// Tpsv solves one of the packed triangular systems
+//
+//	A * x = b   if t == blas.NoTrans,
+//	Aᵀ * x = b  if t == blas.Trans or blas.ConjTrans,
+//
+// where A is an n×n triangular matrix in packed format, and x and b are
+// vectors.
+//
+// On entry x holds the right-hand side b; on return it holds the solution,
+// which is written in place.
+//
+// The routine does not test for singularity or near-singularity. Such tests
+// must be carried out by the caller beforehand.
+func Tpsv(t blas.Transpose, a TriangularPacked, x Vector) {
+	blas64.Dtpsv(
+		a.Uplo, t, a.Diag,
+		a.N, a.Data,
+		x.Data, x.Inc,
+	)
+}
+
+// Symv performs the symmetric matrix-vector update
+//
+//	y = alpha * A * x + beta * y,
+//
+// where A is an n×n symmetric matrix, x and y are vectors, and alpha and
+// beta are scalars.
+func Symv(alpha float64, a Symmetric, x Vector, beta float64, y Vector) {
+	uplo, n := a.Uplo, a.N
+	blas64.Dsymv(uplo, n, alpha, a.Data, a.Stride, x.Data, x.Inc, beta, y.Data, y.Inc)
+}
+
+// Sbmv performs the symmetric band matrix-vector update
+//
+//	y = alpha * A * x + beta * y,
+//
+// where A is an n×n symmetric band matrix, x and y are vectors, and alpha
+// and beta are scalars.
+func Sbmv(alpha float64, a SymmetricBand, x Vector, beta float64, y Vector) {
+	blas64.Dsbmv(
+		a.Uplo, a.N, a.K,
+		alpha, a.Data, a.Stride,
+		x.Data, x.Inc,
+		beta, y.Data, y.Inc,
+	)
+}
+
+// Spmv performs the packed symmetric matrix-vector update
+//
+//	y = alpha * A * x + beta * y,
+//
+// where A is an n×n symmetric matrix in packed format, x and y are vectors,
+// and alpha and beta are scalars.
+func Spmv(alpha float64, a SymmetricPacked, x Vector, beta float64, y Vector) {
+	uplo, n := a.Uplo, a.N
+	blas64.Dspmv(uplo, n, alpha, a.Data, x.Data, x.Inc, beta, y.Data, y.Inc)
+}
+
+// Ger applies the rank-1 update
+//
+//	A += alpha * x * yᵀ,
+//
+// where A is an m×n dense matrix, x and y are vectors, and alpha is a scalar.
+// The dimensions m and n are taken from a.Rows and a.Cols.
+func Ger(alpha float64, x, y Vector, a General) {
+	m, n := a.Rows, a.Cols
+	blas64.Dger(m, n, alpha, x.Data, x.Inc, y.Data, y.Inc, a.Data, a.Stride)
+}
+
+// Syr applies the symmetric rank-1 update
+//
+//	A += alpha * x * xᵀ,
+//
+// where A is an n×n symmetric matrix, x is a vector, and alpha is a scalar.
+func Syr(alpha float64, x Vector, a Symmetric) {
+	blas64.Dsyr(
+		a.Uplo, a.N,
+		alpha, x.Data, x.Inc,
+		a.Data, a.Stride,
+	)
+}
+
+// Spr applies the packed symmetric rank-1 update
+//
+//	A += alpha * x * xᵀ,
+//
+// where A is an n×n symmetric matrix in packed format, x is a vector, and
+// alpha is a scalar.
+func Spr(alpha float64, x Vector, a SymmetricPacked) {
+	uplo, n := a.Uplo, a.N
+	blas64.Dspr(uplo, n, alpha, x.Data, x.Inc, a.Data)
+}
+
+// Syr2 applies the symmetric rank-2 update
+//
+//	A += alpha * x * yᵀ + alpha * y * xᵀ,
+//
+// where A is a symmetric n×n matrix, x and y are vectors, and alpha is a scalar.
+func Syr2(alpha float64, x, y Vector, a Symmetric) {
+	blas64.Dsyr2(
+		a.Uplo, a.N,
+		alpha,
+		x.Data, x.Inc,
+		y.Data, y.Inc,
+		a.Data, a.Stride,
+	)
+}
+
+// Spr2 applies the packed symmetric rank-2 update
+//
+//	A += alpha * x * yᵀ + alpha * y * xᵀ,
+//
+// where A is an n×n symmetric matrix in packed format, x and y are vectors,
+// and alpha is a scalar.
+func Spr2(alpha float64, x, y Vector, a SymmetricPacked) {
+	uplo, n := a.Uplo, a.N
+	blas64.Dspr2(uplo, n, alpha, x.Data, x.Inc, y.Data, y.Inc, a.Data)
+}
+
+// Level 3
+
+// Gemm computes
+//
+//	C = alpha * A * B + beta * C,
+//
+// where A, B, and C are dense matrices, and alpha and beta are scalars.
+// tA and tB specify whether A or B are transposed.
+func Gemm(tA, tB blas.Transpose, alpha float64, a, b General, beta float64, c General) {
+	var m, n, k int
+	if tA == blas.NoTrans {
+		m, k = a.Rows, a.Cols
+	} else {
+		m, k = a.Cols, a.Rows
+	}
+	if tB == blas.NoTrans {
+		n = b.Cols
+	} else {
+		n = b.Rows
+	}
+	blas64.Dgemm(tA, tB, m, n, k, alpha, a.Data, a.Stride, b.Data, b.Stride, beta, c.Data, c.Stride)
+}
+
+// Symm performs
+//
+//	C = alpha * A * B + beta * C  if s == blas.Left,
+//	C = alpha * B * A + beta * C  if s == blas.Right,
+//
+// where A is an n×n or m×m symmetric matrix, B and C are m×n matrices, and
+// alpha is a scalar.
+func Symm(s blas.Side, alpha float64, a Symmetric, b General, beta float64, c General) {
+	var m, n int
+	if s == blas.Left {
+		m, n = a.N, b.Cols
+	} else {
+		m, n = b.Rows, a.N
+	}
+	blas64.Dsymm(s, a.Uplo, m, n, alpha, a.Data, a.Stride, b.Data, b.Stride, beta, c.Data, c.Stride)
+}
+
+// Syrk performs a symmetric rank-k update
+//
+//	C = alpha * A * Aᵀ + beta * C  if t == blas.NoTrans,
+//	C = alpha * Aᵀ * A + beta * C  if t == blas.Trans or blas.ConjTrans,
+//
+// where C is an n×n symmetric matrix, A is an n×k matrix if t == blas.NoTrans and
+// a k×n matrix otherwise, and alpha and beta are scalars.
+func Syrk(t blas.Transpose, alpha float64, a General, beta float64, c Symmetric) {
+	var n, k int
+	if t == blas.NoTrans {
+		n, k = a.Rows, a.Cols
+	} else {
+		n, k = a.Cols, a.Rows
+	}
+	blas64.Dsyrk(c.Uplo, t, n, k, alpha, a.Data, a.Stride, beta, c.Data, c.Stride)
+}
+
+// Syr2k performs a symmetric rank-2k update
+//
+//	C = alpha * A * Bᵀ + alpha * B * Aᵀ + beta * C  if t == blas.NoTrans,
+//	C = alpha * Aᵀ * B + alpha * Bᵀ * A + beta * C  if t == blas.Trans or blas.ConjTrans,
+//
+// where C is an n×n symmetric matrix, A and B are n×k matrices if t == NoTrans
+// and k×n matrices otherwise, and alpha and beta are scalars.
+func Syr2k(t blas.Transpose, alpha float64, a, b General, beta float64, c Symmetric) {
+	var n, k int
+	if t == blas.NoTrans {
+		n, k = a.Rows, a.Cols
+	} else {
+		n, k = a.Cols, a.Rows
+	}
+	blas64.Dsyr2k(c.Uplo, t, n, k, alpha, a.Data, a.Stride, b.Data, b.Stride, beta, c.Data, c.Stride)
+}
+
+// Trmm overwrites B with a triangular matrix-matrix product:
+//
+//	B = alpha * A * B   if tA == blas.NoTrans and s == blas.Left,
+//	B = alpha * Aᵀ * B  if tA == blas.Trans or blas.ConjTrans, and s == blas.Left,
+//	B = alpha * B * A   if tA == blas.NoTrans and s == blas.Right,
+//	B = alpha * B * Aᵀ  if tA == blas.Trans or blas.ConjTrans, and s == blas.Right,
+//
+// where A is an n×n or m×m triangular matrix, B is an m×n matrix, and alpha is
+// a scalar. m and n are taken from b.Rows and b.Cols; a.N is not consulted by
+// this wrapper.
+func Trmm(s blas.Side, tA blas.Transpose, alpha float64, a Triangular, b General) {
+	m, n := b.Rows, b.Cols
+	blas64.Dtrmm(s, a.Uplo, tA, a.Diag, m, n, alpha, a.Data, a.Stride, b.Data, b.Stride)
+}
+
+// Trsm solves one of the triangular matrix equations
+//
+//	A * X = alpha * B   if tA == blas.NoTrans and s == blas.Left,
+//	Aᵀ * X = alpha * B  if tA == blas.Trans or blas.ConjTrans, and s == blas.Left,
+//	X * A = alpha * B   if tA == blas.NoTrans and s == blas.Right,
+//	X * Aᵀ = alpha * B  if tA == blas.Trans or blas.ConjTrans, and s == blas.Right,
+//
+// where A is an n×n or m×m triangular matrix, X and B are m×n matrices, and
+// alpha is a scalar.
+//
+// On entry b holds the values of B; on return it holds the solution X, which
+// is written in place. m and n are taken from b.Rows and b.Cols.
+//
+// No check is made that A is invertible.
+func Trsm(s blas.Side, tA blas.Transpose, alpha float64, a Triangular, b General) {
+	blas64.Dtrsm(
+		s, a.Uplo, tA, a.Diag,
+		b.Rows, b.Cols,
+		alpha, a.Data, a.Stride,
+		b.Data, b.Stride,
+	)
+}
--- a/vendor/gonum.org/v1/gonum/blas/blas64/conv.go
+++ b/vendor/gonum.org/v1/gonum/blas/blas64/conv.go
@@ -0,0 +1,277 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package blas64
+
+import "gonum.org/v1/gonum/blas"
+
+// GeneralCols represents a matrix using the conventional column-major storage scheme.
+//
+// It shares its fields with General, but the conversion routines address
+// element (i, j) as Data[i+j*Stride], so Stride is the distance between the
+// starts of consecutive columns.
+type GeneralCols General
+
+// From copies every element of the row-major matrix a into the column-major
+// receiver. The receiver must have the same dimensions as a and have adequate
+// backing data storage.
+//
+// From panics if the dimensions differ or if the receiver's Data is shorter
+// than (Cols-1)*Stride+Rows.
+func (t GeneralCols) From(a General) {
+	switch {
+	case t.Rows != a.Rows, t.Cols != a.Cols:
+		panic("blas64: mismatched dimension")
+	case len(t.Data) < (t.Cols-1)*t.Stride+t.Rows:
+		panic("blas64: short data slice")
+	}
+	for i := 0; i < a.Rows; i++ {
+		// row is the i-th row of the source; scatter it down column
+		// positions of the destination.
+		row := a.Data[i*a.Stride : i*a.Stride+a.Cols]
+		for j := range row {
+			t.Data[i+j*t.Stride] = row[j]
+		}
+	}
+}
+
+// From copies every element of the column-major matrix a into the row-major
+// receiver. The receiver must have the same dimensions as a and have adequate
+// backing data storage.
+//
+// From panics if the dimensions differ or if the receiver's Data is shorter
+// than (Rows-1)*Stride+Cols.
+func (t General) From(a GeneralCols) {
+	switch {
+	case t.Rows != a.Rows, t.Cols != a.Cols:
+		panic("blas64: mismatched dimension")
+	case len(t.Data) < (t.Rows-1)*t.Stride+t.Cols:
+		panic("blas64: short data slice")
+	}
+	for j := 0; j < a.Cols; j++ {
+		// col is the j-th column of the source; scatter it across row
+		// positions of the destination.
+		col := a.Data[j*a.Stride : j*a.Stride+a.Rows]
+		for i := range col {
+			t.Data[i*t.Stride+j] = col[i]
+		}
+	}
+}
+
+// TriangularCols represents a triangular matrix using the conventional
+// column-major storage scheme.
+//
+// It shares its fields with Triangular, but the conversion routines address
+// element (i, j) as Data[i+j*Stride].
+type TriangularCols Triangular
+
+// From copies the referenced triangle of the row-major matrix a into the
+// column-major receiver. The receiver must have the same dimensions, uplo and
+// diag as a and have adequate backing data storage.
+//
+// Only the elements selected by a.Uplo are copied: the upper triangle for
+// blas.Upper, the lower triangle for blas.Lower and the whole matrix for
+// blas.All. From panics on any mismatch and on any other uplo value.
+func (t TriangularCols) From(a Triangular) {
+	switch {
+	case t.N != a.N:
+		panic("blas64: mismatched dimension")
+	case t.Uplo != a.Uplo:
+		panic("blas64: mismatched BLAS uplo")
+	case t.Diag != a.Diag:
+		panic("blas64: mismatched BLAS diag")
+	}
+	uplo := a.Uplo
+	if uplo != blas.Upper && uplo != blas.Lower && uplo != blas.All {
+		panic("blas64: bad BLAS uplo")
+	}
+	for i := 0; i < a.N; i++ {
+		// Columns [first, end) of row i belong to the selected triangle.
+		first, end := 0, a.N
+		switch uplo {
+		case blas.Upper:
+			first = i
+		case blas.Lower:
+			end = i + 1
+		}
+		for j := first; j < end; j++ {
+			t.Data[i+j*t.Stride] = a.Data[i*a.Stride+j]
+		}
+	}
+}
+
+// From copies the referenced triangle of the column-major matrix a into the
+// row-major receiver. The receiver must have the same dimensions, uplo and
+// diag as a and have adequate backing data storage.
+//
+// Only the elements selected by a.Uplo are copied: the upper triangle for
+// blas.Upper, the lower triangle for blas.Lower and the whole matrix for
+// blas.All. From panics on any mismatch and on any other uplo value.
+func (t Triangular) From(a TriangularCols) {
+	switch {
+	case t.N != a.N:
+		panic("blas64: mismatched dimension")
+	case t.Uplo != a.Uplo:
+		panic("blas64: mismatched BLAS uplo")
+	case t.Diag != a.Diag:
+		panic("blas64: mismatched BLAS diag")
+	}
+	uplo := a.Uplo
+	if uplo != blas.Upper && uplo != blas.Lower && uplo != blas.All {
+		panic("blas64: bad BLAS uplo")
+	}
+	for i := 0; i < a.N; i++ {
+		// Columns [first, end) of row i belong to the selected triangle.
+		first, end := 0, a.N
+		switch uplo {
+		case blas.Upper:
+			first = i
+		case blas.Lower:
+			end = i + 1
+		}
+		for j := first; j < end; j++ {
+			t.Data[i*t.Stride+j] = a.Data[i+j*a.Stride]
+		}
+	}
+}
+
+// BandCols represents a band matrix using the band column-major storage scheme.
+//
+// It shares its fields with Band, but the conversion routines address element
+// (i, j) of the band as Data[KU+i-j+j*Stride].
+type BandCols Band
+
+// From copies the band of the row-major band matrix a into the column-major
+// receiver. The receiver must have the same dimensions and bandwidth as a and
+// have adequate backing data storage.
+//
+// From panics if the dimensions or bandwidths differ, or if either stride is
+// smaller than KL+KU+1.
+func (t BandCols) From(a Band) {
+	switch {
+	case t.Rows != a.Rows, t.Cols != a.Cols:
+		panic("blas64: mismatched dimension")
+	case t.KL != a.KL, t.KU != a.KU:
+		panic("blas64: mismatched bandwidth")
+	case a.Stride < a.KL+a.KU+1:
+		panic("blas64: short stride for source")
+	case t.Stride < t.KL+t.KU+1:
+		panic("blas64: short stride for destination")
+	}
+	for i := 0; i < a.Rows; i++ {
+		// Columns [first, end) are the part of row i that lies inside the band.
+		first := max(0, i-a.KL)
+		end := min(i+a.KU+1, a.Cols)
+		for j := first; j < end; j++ {
+			t.Data[i+t.KU-j+j*t.Stride] = a.Data[j+a.KL-i+i*a.Stride]
+		}
+	}
+}
+
+// From fills the receiver with elements from a. The receiver
+// must have the same dimensions and bandwidth as a and have
+// adequate backing data storage.
+//
+// From panics if the dimensions or bandwidths differ, or if either stride is
+// smaller than KL+KU+1. The receiver and a may use different strides.
+func (t Band) From(a BandCols) {
+	if t.Rows != a.Rows || t.Cols != a.Cols {
+		panic("blas64: mismatched dimension")
+	}
+	if t.KL != a.KL || t.KU != a.KU {
+		panic("blas64: mismatched bandwidth")
+	}
+	if a.Stride < a.KL+a.KU+1 {
+		panic("blas64: short stride for source")
+	}
+	if t.Stride < t.KL+t.KU+1 {
+		panic("blas64: short stride for destination")
+	}
+	for j := 0; j < a.Cols; j++ {
+		for i := max(0, j-a.KU); i < min(j+a.KL+1, a.Rows); i++ {
+			// Element (i, j) sits at offset KL+j-i within row i of the
+			// row-major destination and at offset KU+i-j within column j
+			// of the column-major source. Each side must be indexed with
+			// its own stride; KL and KU are equal on both sides by the
+			// check above.
+			t.Data[j+t.KL-i+i*t.Stride] = a.Data[i+a.KU-j+j*a.Stride]
+		}
+	}
+}
+
+// TriangularBandCols represents a triangular matrix using the band column-major storage scheme.
+//
+// It shares its fields with TriangularBand. Conversions to and from
+// TriangularBand are delegated to the BandCols and Band conversions.
+type TriangularBandCols TriangularBand
+
+// From fills the receiver with elements from a. The receiver
+// must have the same dimensions, bandwidth and uplo as a and
+// have adequate backing data storage.
+func (t TriangularBandCols) From(a TriangularBand) {
+	if t.N != a.N {
+		panic("blas64: mismatched dimension")
+	}
+	if t.K != a.K {
+		panic("blas64: mismatched bandwidth")
+	}
+	if a.Stride < a.K+1 {
+		panic("blas64: short stride for source")
+	}
+	if t.Stride < t.K+1 {
+		panic("blas64: short stride for destination")
+	}
+	if t.Uplo != a.Uplo {
+		panic("blas64: mismatched BLAS uplo")
+	}
+	if t.Diag != a.Diag {
+		panic("blas64: mismatched BLAS diag")
+	}
+	dst := BandCols{
+		Rows: t.N, Cols: t.N,
+		Stride: t.Stride,
+		Data:   t.Data,
+	}
+	src := Band{
+		Rows: a.N, Cols: a.N,
+		Stride: a.Stride,
+		Data:   a.Data,
+	}
+	switch a.Uplo {
+	default:
+		panic("blas64: bad BLAS uplo")
+	case blas.Upper:
+		dst.KU = t.K
+		src.KU = a.K
+	case blas.Lower:
+		dst.KL = t.K
+		src.KL = a.K
+	}
+	dst.From(src)
+}
+
+// From fills the receiver with elements from a. The receiver
+// must have the same dimensions, bandwidth and uplo as a and
+// have adequate backing data storage.
+func (t TriangularBand) From(a TriangularBandCols) {
+	if t.N != a.N {
+		panic("blas64: mismatched dimension")
+	}
+	if t.K != a.K {
+		panic("blas64: mismatched bandwidth")
+	}
+	if a.Stride < a.K+1 {
+		panic("blas64: short stride for source")
+	}
+	if t.Stride < t.K+1 {
+		panic("blas64: short stride for destination")
+	}
+	if t.Uplo != a.Uplo {
+		panic("blas64: mismatched BLAS uplo")
+	}
+	if t.Diag != a.Diag {
+		panic("blas64: mismatched BLAS diag")
+	}
+	dst := Band{
+		Rows: t.N, Cols: t.N,
+		Stride: t.Stride,
+		Data:   t.Data,
+	}
+	src := BandCols{
+		Rows: a.N, Cols: a.N,
+		Stride: a.Stride,
+		Data:   a.Data,
+	}
+	switch a.Uplo {
+	default:
+		panic("blas64: bad BLAS uplo")
+	case blas.Upper:
+		dst.KU = t.K
+		src.KU = a.K
+	case blas.Lower:
+		dst.KL = t.K
+		src.KL = a.K
+	}
+	dst.From(src)
+}
+
+// min returns the smaller of a and b.
+func min(a, b int) int {
+	if b <= a {
+		return b
+	}
+	return a
+}
+
+// max returns the larger of a and b.
+func max(a, b int) int {
+	if b >= a {
+		return b
+	}
+	return a
+}
--- a/vendor/gonum.org/v1/gonum/blas/blas64/conv_symmetric.go
+++ b/vendor/gonum.org/v1/gonum/blas/blas64/conv_symmetric.go
@@ -0,0 +1,153 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package blas64
+
+import "gonum.org/v1/gonum/blas"
+
+// SymmetricCols represents a symmetric matrix using the conventional
+// column-major storage scheme.
+//
+// It shares its fields with Symmetric, but the conversion routines address
+// element (i, j) as Data[i+j*Stride].
+type SymmetricCols Symmetric
+
+// From copies the referenced triangle of the row-major symmetric matrix a
+// into the column-major receiver. The receiver must have the same dimensions
+// and uplo as a and have adequate backing data storage.
+//
+// From panics on a dimension or uplo mismatch and when a.Uplo is neither
+// blas.Upper nor blas.Lower.
+func (t SymmetricCols) From(a Symmetric) {
+	switch {
+	case t.N != a.N:
+		panic("blas64: mismatched dimension")
+	case t.Uplo != a.Uplo:
+		panic("blas64: mismatched BLAS uplo")
+	}
+	n := a.N
+	switch a.Uplo {
+	case blas.Upper:
+		// Row i of the upper triangle spans columns i..n-1.
+		for i := 0; i < n; i++ {
+			rowOff := i * a.Stride
+			for j := i; j < n; j++ {
+				t.Data[i+j*t.Stride] = a.Data[rowOff+j]
+			}
+		}
+	case blas.Lower:
+		// Row i of the lower triangle spans columns 0..i.
+		for i := 0; i < n; i++ {
+			rowOff := i * a.Stride
+			for j := 0; j <= i; j++ {
+				t.Data[i+j*t.Stride] = a.Data[rowOff+j]
+			}
+		}
+	default:
+		panic("blas64: bad BLAS uplo")
+	}
+}
+
+// From copies the referenced triangle of the column-major symmetric matrix a
+// into the row-major receiver. The receiver must have the same dimensions and
+// uplo as a and have adequate backing data storage.
+//
+// From panics on a dimension or uplo mismatch and when a.Uplo is neither
+// blas.Upper nor blas.Lower.
+func (t Symmetric) From(a SymmetricCols) {
+	switch {
+	case t.N != a.N:
+		panic("blas64: mismatched dimension")
+	case t.Uplo != a.Uplo:
+		panic("blas64: mismatched BLAS uplo")
+	}
+	n := a.N
+	switch a.Uplo {
+	case blas.Upper:
+		// Row i of the upper triangle spans columns i..n-1.
+		for i := 0; i < n; i++ {
+			rowOff := i * t.Stride
+			for j := i; j < n; j++ {
+				t.Data[rowOff+j] = a.Data[i+j*a.Stride]
+			}
+		}
+	case blas.Lower:
+		// Row i of the lower triangle spans columns 0..i.
+		for i := 0; i < n; i++ {
+			rowOff := i * t.Stride
+			for j := 0; j <= i; j++ {
+				t.Data[rowOff+j] = a.Data[i+j*a.Stride]
+			}
+		}
+	default:
+		panic("blas64: bad BLAS uplo")
+	}
+}
+
+// SymmetricBandCols represents a symmetric matrix using the band column-major storage scheme.
+//
+// It shares its fields with SymmetricBand. Conversions to and from
+// SymmetricBand are delegated to the BandCols and Band conversions.
+type SymmetricBandCols SymmetricBand
+
+// From fills the receiver with elements from a. The receiver
+// must have the same dimensions, bandwidth and uplo as a and
+// have adequate backing data storage.
+func (t SymmetricBandCols) From(a SymmetricBand) {
+	if t.N != a.N {
+		panic("blas64: mismatched dimension")
+	}
+	if t.K != a.K {
+		panic("blas64: mismatched bandwidth")
+	}
+	if a.Stride < a.K+1 {
+		panic("blas64: short stride for source")
+	}
+	if t.Stride < t.K+1 {
+		panic("blas64: short stride for destination")
+	}
+	if t.Uplo != a.Uplo {
+		panic("blas64: mismatched BLAS uplo")
+	}
+	dst := BandCols{
+		Rows: t.N, Cols: t.N,
+		Stride: t.Stride,
+		Data:   t.Data,
+	}
+	src := Band{
+		Rows: a.N, Cols: a.N,
+		Stride: a.Stride,
+		Data:   a.Data,
+	}
+	switch a.Uplo {
+	default:
+		panic("blas64: bad BLAS uplo")
+	case blas.Upper:
+		dst.KU = t.K
+		src.KU = a.K
+	case blas.Lower:
+		dst.KL = t.K
+		src.KL = a.K
+	}
+	dst.From(src)
+}
+
+// From fills the receiver with elements from a. The receiver
+// must have the same dimensions, bandwidth and uplo as a and
+// have adequate backing data storage.
+func (t SymmetricBand) From(a SymmetricBandCols) {
+	if t.N != a.N {
+		panic("blas64: mismatched dimension")
+	}
+	if t.K != a.K {
+		panic("blas64: mismatched bandwidth")
+	}
+	if a.Stride < a.K+1 {
+		panic("blas64: short stride for source")
+	}
+	if t.Stride < t.K+1 {
+		panic("blas64: short stride for destination")
+	}
+	if t.Uplo != a.Uplo {
+		panic("blas64: mismatched BLAS uplo")
+	}
+	dst := Band{
+		Rows: t.N, Cols: t.N,
+		Stride: t.Stride,
+		Data:   t.Data,
+	}
+	src := BandCols{
+		Rows: a.N, Cols: a.N,
+		Stride: a.Stride,
+		Data:   a.Data,
+	}
+	switch a.Uplo {
+	default:
+		panic("blas64: bad BLAS uplo")
+	case blas.Upper:
+		dst.KU = t.K
+		src.KU = a.K
+	case blas.Lower:
+		dst.KL = t.K
+		src.KL = a.K
+	}
+	dst.From(src)
+}
--- a/vendor/gonum.org/v1/gonum/blas/blas64/doc.go
+++ b/vendor/gonum.org/v1/gonum/blas/blas64/doc.go
@@ -0,0 +1,6 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package blas64 provides a simple interface to the float64 BLAS API.
+package blas64 // import "gonum.org/v1/gonum/blas/blas64"
--- a/vendor/gonum.org/v1/gonum/blas/cblas128/cblas128.go
+++ b/vendor/gonum.org/v1/gonum/blas/cblas128/cblas128.go
@@ -0,0 +1,600 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cblas128
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/blas/gonum"
+)
+
+// cblas128 is the package-level complex128 BLAS implementation that the
+// wrapper functions in this file dispatch to. It is initialised to
+// gonum.Implementation{} and replaced by Use.
+var cblas128 blas.Complex128 = gonum.Implementation{}
+
+// Use sets the BLAS complex128 implementation to be used by subsequent BLAS calls.
+// The default implementation is
+// gonum.org/v1/gonum/blas/gonum.Implementation.
+//
+// Use is a plain assignment to a package-level variable; no locking is
+// performed here.
+func Use(b blas.Complex128) {
+	cblas128 = b
+}
+
+// Implementation returns the current BLAS complex128 implementation.
+//
+// Implementation allows direct calls to the current BLAS complex128
+// implementation, giving finer control of parameters than the wrapper
+// functions in this package.
+func Implementation() blas.Complex128 {
+	return cblas128
+}
+
+// Vector represents a vector with an associated element increment.
+type Vector struct {
+	N    int // number of elements; the two-vector Level 1 wrappers require equal N
+	Inc  int // element increment, passed through to the BLAS routine
+	Data []complex128 // backing storage
+}
+
+// General represents a matrix using the conventional storage scheme.
+//
+// The conversion methods in this package read row i of a General from
+// Data[i*Stride : i*Stride+Cols].
+type General struct {
+	Rows, Cols int
+	Stride     int // offset in Data between the starts of consecutive rows
+	Data       []complex128
+}
+
+// Band represents a band matrix using the band storage scheme.
+//
+// The conversion methods in this package address element (i, j) of a
+// Band at Data[j+KL-i+i*Stride] and require Stride >= KL+KU+1.
+type Band struct {
+	Rows, Cols int
+	KL, KU     int // number of diagonals kept below (KL) and above (KU) the main diagonal
+	Stride     int // offset in Data between the starts of consecutive rows
+	Data       []complex128
+}
+
+// Triangular represents a triangular matrix using the conventional storage scheme.
+type Triangular struct {
+	N      int // order of the matrix
+	Stride int // passed to the BLAS routines as the leading dimension of Data
+	Data   []complex128
+	Uplo   blas.Uplo // which triangle of the matrix is referenced
+	Diag   blas.Diag // forwarded to the BLAS routines as their diag argument
+}
+
+// TriangularBand represents a triangular matrix using the band storage scheme.
+type TriangularBand struct {
+	N, K   int // order of the matrix (N) and its bandwidth (K)
+	Stride int // the conversion methods require Stride >= K+1
+	Data   []complex128
+	Uplo   blas.Uplo // which triangle of the matrix is referenced
+	Diag   blas.Diag // forwarded to the BLAS routines as their diag argument
+}
+
+// TriangularPacked represents a triangular matrix using the packed storage scheme.
+// It carries no stride; the wrappers pass only N and Data to the BLAS routines.
+type TriangularPacked struct {
+	N    int // order of the matrix
+	Data []complex128
+	Uplo blas.Uplo // which triangle of the matrix is referenced
+	Diag blas.Diag // forwarded to the BLAS routines as their diag argument
+}
+
+// Symmetric represents a symmetric matrix using the conventional storage scheme.
+type Symmetric struct {
+	N      int // order of the matrix
+	Stride int // passed to the BLAS routines as the leading dimension of Data
+	Data   []complex128
+	Uplo   blas.Uplo // which triangle of the matrix is referenced
+}
+
+// SymmetricBand represents a symmetric matrix using the band storage scheme.
+type SymmetricBand struct {
+	N, K   int // order of the matrix (N) and its bandwidth (K)
+	Stride int // the conversion methods require Stride >= K+1
+	Data   []complex128
+	Uplo   blas.Uplo // which triangle of the matrix is referenced
+}
+
+// SymmetricPacked represents a symmetric matrix using the packed storage scheme.
+// It carries no stride.
+type SymmetricPacked struct {
+	N    int // order of the matrix
+	Data []complex128
+	Uplo blas.Uplo // which triangle of the matrix is referenced
+}
+
+// Hermitian represents an Hermitian matrix using the conventional storage scheme.
+// It has the same fields as Symmetric.
+type Hermitian Symmetric
+
+// HermitianBand represents an Hermitian matrix using the band storage scheme.
+// It has the same fields as SymmetricBand.
+type HermitianBand SymmetricBand
+
+// HermitianPacked represents an Hermitian matrix using the packed storage scheme.
+// It has the same fields as SymmetricPacked.
+type HermitianPacked SymmetricPacked
+
+// Level 1
+
+const (
+	// negInc is the panic message used when a vector increment is negative.
+	negInc    = "cblas128: negative vector increment"
+	// badLength is the panic message used when two vector operands differ in length.
+	badLength = "cblas128: vector length mismatch"
+)
+
+// Dotu returns the dot product of x and y taken without complex
+// conjugation:
+//
+//	xᵀ * y.
+//
+// x and y must have the same length; Dotu panics otherwise.
+func Dotu(x, y Vector) complex128 {
+	n := x.N
+	if n != y.N {
+		panic(badLength)
+	}
+	return cblas128.Zdotu(n, x.Data, x.Inc, y.Data, y.Inc)
+}
+
+// Dotc returns the dot product of x and y taken with complex
+// conjugation:
+//
+//	xᴴ * y.
+//
+// x and y must have the same length; Dotc panics otherwise.
+func Dotc(x, y Vector) complex128 {
+	n := x.N
+	if n != y.N {
+		panic(badLength)
+	}
+	return cblas128.Zdotc(n, x.Data, x.Inc, y.Data, y.Inc)
+}
+
+// Nrm2 computes the Euclidean norm of the complex vector x:
+//
+//	sqrt(\sum_i |x[i]|^2).
+//
+// The result is real. Nrm2 will panic if the vector increment is
+// negative.
+func Nrm2(x Vector) float64 {
+	if x.Inc < 0 {
+		panic(negInc)
+	}
+	return cblas128.Dznrm2(x.N, x.Data, x.Inc)
+}
+
+// Asum returns the sum, over all elements of x, of the magnitudes of
+// the real and imaginary parts:
+//
+//	\sum_i (|Re x[i]| + |Im x[i]|).
+//
+// The vector increment must not be negative; Asum panics otherwise.
+func Asum(x Vector) float64 {
+	inc := x.Inc
+	if inc < 0 {
+		panic(negInc)
+	}
+	return cblas128.Dzasum(x.N, x.Data, inc)
+}
+
+// Iamax returns the index of an element of x for which
+// |Re x[i]|+|Im x[i]| is largest. When several elements tie, the
+// earliest index is returned.
+//
+// Iamax returns -1 if n == 0.
+//
+// The vector increment must not be negative; Iamax panics otherwise.
+func Iamax(x Vector) int {
+	inc := x.Inc
+	if inc < 0 {
+		panic(negInc)
+	}
+	return cblas128.Izamax(x.N, x.Data, inc)
+}
+
+// Swap exchanges the contents of x and y element by element:
+//
+//	x[i], y[i] = y[i], x[i] for all i.
+//
+// x and y must have the same length; Swap panics otherwise.
+func Swap(x, y Vector) {
+	n := x.N
+	if n != y.N {
+		panic(badLength)
+	}
+	cblas128.Zswap(n, x.Data, x.Inc, y.Data, y.Inc)
+}
+
+// Copy stores the elements of x into y:
+//
+//	y[i] = x[i] for all i.
+//
+// x and y must have the same length; Copy panics otherwise.
+func Copy(x, y Vector) {
+	n := x.N
+	if n != y.N {
+		panic(badLength)
+	}
+	cblas128.Zcopy(n, x.Data, x.Inc, y.Data, y.Inc)
+}
+
+// Axpy adds a scaled copy of x to y:
+//
+//	y = alpha * x + y,
+//
+// where x and y are vectors, and alpha is a scalar.
+// x and y must have the same length; Axpy panics otherwise.
+func Axpy(alpha complex128, x, y Vector) {
+	n := x.N
+	if n != y.N {
+		panic(badLength)
+	}
+	cblas128.Zaxpy(n, alpha, x.Data, x.Inc, y.Data, y.Inc)
+}
+
+// Scal scales x in place by a complex factor:
+//
+//	x = alpha * x,
+//
+// where x is a vector, and alpha is a scalar.
+//
+// The vector increment must not be negative; Scal panics otherwise.
+func Scal(alpha complex128, x Vector) {
+	inc := x.Inc
+	if inc < 0 {
+		panic(negInc)
+	}
+	cblas128.Zscal(x.N, alpha, x.Data, inc)
+}
+
+// Dscal scales x in place by a real factor:
+//
+//	x = alpha * x,
+//
+// where x is a vector, and alpha is a real scalar.
+//
+// The vector increment must not be negative; Dscal panics otherwise.
+func Dscal(alpha float64, x Vector) {
+	inc := x.Inc
+	if inc < 0 {
+		panic(negInc)
+	}
+	cblas128.Zdscal(x.N, alpha, x.Data, inc)
+}
+
+// Level 2
+
+// Gemv performs a general matrix-vector multiply-accumulate:
+//
+//	y = alpha * A * x + beta * y   if t == blas.NoTrans,
+//	y = alpha * Aᵀ * x + beta * y  if t == blas.Trans,
+//	y = alpha * Aᴴ * x + beta * y  if t == blas.ConjTrans,
+//
+// where A is an m×n dense matrix, x and y are vectors, and alpha and beta are
+// scalars.
+func Gemv(t blas.Transpose, alpha complex128, a General, x Vector, beta complex128, y Vector) {
+	cblas128.Zgemv(
+		t, a.Rows, a.Cols,
+		alpha, a.Data, a.Stride,
+		x.Data, x.Inc,
+		beta, y.Data, y.Inc,
+	)
+}
+
+// Gbmv performs a band matrix-vector multiply-accumulate:
+//
+//	y = alpha * A * x + beta * y   if t == blas.NoTrans,
+//	y = alpha * Aᵀ * x + beta * y  if t == blas.Trans,
+//	y = alpha * Aᴴ * x + beta * y  if t == blas.ConjTrans,
+//
+// where A is an m×n band matrix, x and y are vectors, and alpha and beta are
+// scalars.
+func Gbmv(t blas.Transpose, alpha complex128, a Band, x Vector, beta complex128, y Vector) {
+	cblas128.Zgbmv(
+		t, a.Rows, a.Cols, a.KL, a.KU,
+		alpha, a.Data, a.Stride,
+		x.Data, x.Inc,
+		beta, y.Data, y.Inc,
+	)
+}
+
+// Trmv multiplies x in place by a triangular matrix:
+//
+//	x = A * x   if t == blas.NoTrans,
+//	x = Aᵀ * x  if t == blas.Trans,
+//	x = Aᴴ * x  if t == blas.ConjTrans,
+//
+// where A is an n×n triangular matrix, and x is a vector.
+func Trmv(t blas.Transpose, a Triangular, x Vector) {
+	cblas128.Ztrmv(
+		a.Uplo, t, a.Diag,
+		a.N, a.Data, a.Stride,
+		x.Data, x.Inc,
+	)
+}
+
+// Tbmv multiplies x in place by a triangular band matrix:
+//
+//	x = A * x   if t == blas.NoTrans,
+//	x = Aᵀ * x  if t == blas.Trans,
+//	x = Aᴴ * x  if t == blas.ConjTrans,
+//
+// where A is an n×n triangular band matrix, and x is a vector.
+func Tbmv(t blas.Transpose, a TriangularBand, x Vector) {
+	cblas128.Ztbmv(
+		a.Uplo, t, a.Diag,
+		a.N, a.K, a.Data, a.Stride,
+		x.Data, x.Inc,
+	)
+}
+
+// Tpmv multiplies x in place by a packed triangular matrix:
+//
+//	x = A * x   if t == blas.NoTrans,
+//	x = Aᵀ * x  if t == blas.Trans,
+//	x = Aᴴ * x  if t == blas.ConjTrans,
+//
+// where A is an n×n triangular matrix in packed format, and x is a vector.
+func Tpmv(t blas.Transpose, a TriangularPacked, x Vector) {
+	cblas128.Ztpmv(
+		a.Uplo, t, a.Diag,
+		a.N, a.Data,
+		x.Data, x.Inc,
+	)
+}
+
+// Trsv solves one of the triangular systems
+//
+//	A * x = b   if t == blas.NoTrans,
+//	Aᵀ * x = b  if t == blas.Trans,
+//	Aᴴ * x = b  if t == blas.ConjTrans,
+//
+// where A is an n×n triangular matrix and x is a vector.
+//
+// On entry x holds the right-hand side b; on return it holds the
+// solution, which overwrites b in place.
+//
+// Trsv does not test for singularity or near-singularity. Such tests
+// must be performed before calling this routine.
+func Trsv(t blas.Transpose, a Triangular, x Vector) {
+	cblas128.Ztrsv(
+		a.Uplo, t, a.Diag,
+		a.N, a.Data, a.Stride,
+		x.Data, x.Inc,
+	)
+}
+
+// Tbsv solves one of the triangular band systems
+//
+//	A * x = b   if t == blas.NoTrans,
+//	Aᵀ * x = b  if t == blas.Trans,
+//	Aᴴ * x = b  if t == blas.ConjTrans,
+//
+// where A is an n×n triangular band matrix, and x is a vector.
+//
+// On entry x holds the right-hand side b; on return it holds the
+// solution, which overwrites b in place.
+//
+// Tbsv does not test for singularity or near-singularity. Such tests
+// must be performed before calling this routine.
+func Tbsv(t blas.Transpose, a TriangularBand, x Vector) {
+	cblas128.Ztbsv(
+		a.Uplo, t, a.Diag,
+		a.N, a.K, a.Data, a.Stride,
+		x.Data, x.Inc,
+	)
+}
+
+// Tpsv solves one of the packed triangular systems
+//
+//	A * x = b   if t == blas.NoTrans,
+//	Aᵀ * x = b  if t == blas.Trans,
+//	Aᴴ * x = b  if t == blas.ConjTrans,
+//
+// where A is an n×n triangular matrix in packed format and x is a vector.
+//
+// On entry x holds the right-hand side b; on return it holds the
+// solution, which overwrites b in place.
+//
+// Tpsv does not test for singularity or near-singularity. Such tests
+// must be performed before calling this routine.
+func Tpsv(t blas.Transpose, a TriangularPacked, x Vector) {
+	cblas128.Ztpsv(
+		a.Uplo, t, a.Diag,
+		a.N, a.Data,
+		x.Data, x.Inc,
+	)
+}
+
+// Hemv performs a Hermitian matrix-vector multiply-accumulate:
+//
+//	y = alpha * A * x + beta * y,
+//
+// where A is an n×n Hermitian matrix, x and y are vectors, and alpha and
+// beta are scalars.
+func Hemv(alpha complex128, a Hermitian, x Vector, beta complex128, y Vector) {
+	cblas128.Zhemv(
+		a.Uplo, a.N,
+		alpha, a.Data, a.Stride,
+		x.Data, x.Inc,
+		beta, y.Data, y.Inc,
+	)
+}
+
+// Hbmv performs a Hermitian band matrix-vector multiply-accumulate:
+//
+//	y = alpha * A * x + beta * y,
+//
+// where A is an n×n Hermitian band matrix, x and y are vectors, and alpha
+// and beta are scalars.
+func Hbmv(alpha complex128, a HermitianBand, x Vector, beta complex128, y Vector) {
+	cblas128.Zhbmv(
+		a.Uplo, a.N, a.K,
+		alpha, a.Data, a.Stride,
+		x.Data, x.Inc,
+		beta, y.Data, y.Inc,
+	)
+}
+
+// Hpmv performs a packed Hermitian matrix-vector multiply-accumulate:
+//
+//	y = alpha * A * x + beta * y,
+//
+// where A is an n×n Hermitian matrix in packed format, x and y are vectors,
+// and alpha and beta are scalars.
+func Hpmv(alpha complex128, a HermitianPacked, x Vector, beta complex128, y Vector) {
+	cblas128.Zhpmv(
+		a.Uplo, a.N,
+		alpha, a.Data,
+		x.Data, x.Inc,
+		beta, y.Data, y.Inc,
+	)
+}
+
+// Geru applies the unconjugated rank-1 update
+//
+//	A += alpha * x * yᵀ,
+//
+// where A is an m×n dense matrix, x and y are vectors, and alpha is a scalar.
+func Geru(alpha complex128, x, y Vector, a General) {
+	cblas128.Zgeru(
+		a.Rows, a.Cols, alpha,
+		x.Data, x.Inc,
+		y.Data, y.Inc,
+		a.Data, a.Stride,
+	)
+}
+
+// Gerc applies the conjugated rank-1 update
+//
+//	A += alpha * x * yᴴ,
+//
+// where A is an m×n dense matrix, x and y are vectors, and alpha is a scalar.
+func Gerc(alpha complex128, x, y Vector, a General) {
+	cblas128.Zgerc(
+		a.Rows, a.Cols, alpha,
+		x.Data, x.Inc,
+		y.Data, y.Inc,
+		a.Data, a.Stride,
+	)
+}
+
+// Her performs a Hermitian rank-1 update
+//
+//	A += alpha * x * xᴴ,
+//
+// where A is an n×n Hermitian matrix, x is a vector, and alpha is a
+// real scalar.
+func Her(alpha float64, x Vector, a Hermitian) {
+	cblas128.Zher(a.Uplo, a.N, alpha, x.Data, x.Inc, a.Data, a.Stride)
+}
+
+// Hpr applies the rank-1 update
+//
+//	A += alpha * x * xᴴ,
+//
+// where A is an n×n Hermitian matrix in packed format, x is a vector, and
+// alpha is a scalar.
+func Hpr(alpha float64, x Vector, a HermitianPacked) {
+	cblas128.Zhpr(
+		a.Uplo, a.N, alpha,
+		x.Data, x.Inc,
+		a.Data,
+	)
+}
+
+// Her2 applies the rank-2 update
+//
+//	A += alpha * x * yᴴ + conj(alpha) * y * xᴴ,
+//
+// where A is an n×n Hermitian matrix, x and y are vectors, and alpha is a scalar.
+func Her2(alpha complex128, x, y Vector, a Hermitian) {
+	cblas128.Zher2(
+		a.Uplo, a.N, alpha,
+		x.Data, x.Inc,
+		y.Data, y.Inc,
+		a.Data, a.Stride,
+	)
+}
+
+// Hpr2 applies the rank-2 update
+//
+//	A += alpha * x * yᴴ + conj(alpha) * y * xᴴ,
+//
+// where A is an n×n Hermitian matrix in packed format, x and y are vectors,
+// and alpha is a scalar.
+func Hpr2(alpha complex128, x, y Vector, a HermitianPacked) {
+	cblas128.Zhpr2(
+		a.Uplo, a.N, alpha,
+		x.Data, x.Inc,
+		y.Data, y.Inc,
+		a.Data,
+	)
+}
+
+// Level 3
+
+// Gemm computes
+//
+//	C = alpha * A * B + beta * C,
+//
+// where A, B, and C are dense matrices, and alpha and beta are scalars.
+// tA and tB specify whether A or B are transposed or conjugated.
+func Gemm(tA, tB blas.Transpose, alpha complex128, a, b General, beta complex128, c General) {
+	var m, n, k int
+	if tA == blas.NoTrans {
+		m, k = a.Rows, a.Cols
+	} else {
+		m, k = a.Cols, a.Rows
+	}
+	if tB == blas.NoTrans {
+		n = b.Cols
+	} else {
+		n = b.Rows
+	}
+	cblas128.Zgemm(tA, tB, m, n, k, alpha, a.Data, a.Stride, b.Data, b.Stride, beta, c.Data, c.Stride)
+}
+
+// Symm performs
+//
+//	C = alpha * A * B + beta * C  if s == blas.Left,
+//	C = alpha * B * A + beta * C  if s == blas.Right,
+//
+// where A is an n×n or m×m symmetric matrix, B and C are m×n matrices, and
+// alpha and beta are scalars.
+func Symm(s blas.Side, alpha complex128, a Symmetric, b General, beta complex128, c General) {
+	var m, n int
+	if s == blas.Left {
+		m, n = a.N, b.Cols
+	} else {
+		m, n = b.Rows, a.N
+	}
+	cblas128.Zsymm(s, a.Uplo, m, n, alpha, a.Data, a.Stride, b.Data, b.Stride, beta, c.Data, c.Stride)
+}
+
+// Syrk performs a symmetric rank-k update
+//
+//	C = alpha * A * Aᵀ + beta * C  if t == blas.NoTrans,
+//	C = alpha * Aᵀ * A + beta * C  if t == blas.Trans,
+//
+// where C is an n×n symmetric matrix, A is an n×k matrix if t == blas.NoTrans
+// and a k×n matrix otherwise, and alpha and beta are scalars.
+func Syrk(t blas.Transpose, alpha complex128, a General, beta complex128, c Symmetric) {
+	var n, k int
+	if t == blas.NoTrans {
+		n, k = a.Rows, a.Cols
+	} else {
+		n, k = a.Cols, a.Rows
+	}
+	cblas128.Zsyrk(c.Uplo, t, n, k, alpha, a.Data, a.Stride, beta, c.Data, c.Stride)
+}
+
+// Syr2k performs a symmetric rank-2k update
+//
+//	C = alpha * A * Bᵀ + alpha * B * Aᵀ + beta * C  if t == blas.NoTrans,
+//	C = alpha * Aᵀ * B + alpha * Bᵀ * A + beta * C  if t == blas.Trans,
+//
+// where C is an n×n symmetric matrix, A and B are n×k matrices if
+// t == blas.NoTrans and k×n otherwise, and alpha and beta are scalars.
+func Syr2k(t blas.Transpose, alpha complex128, a, b General, beta complex128, c Symmetric) {
+	var n, k int
+	if t == blas.NoTrans {
+		n, k = a.Rows, a.Cols
+	} else {
+		n, k = a.Cols, a.Rows
+	}
+	cblas128.Zsyr2k(c.Uplo, t, n, k, alpha, a.Data, a.Stride, b.Data, b.Stride, beta, c.Data, c.Stride)
+}
+
+// Trmm multiplies B in place by a triangular matrix:
+//
+//	B = alpha * A * B   if tA == blas.NoTrans and s == blas.Left,
+//	B = alpha * Aᵀ * B  if tA == blas.Trans and s == blas.Left,
+//	B = alpha * Aᴴ * B  if tA == blas.ConjTrans and s == blas.Left,
+//	B = alpha * B * A   if tA == blas.NoTrans and s == blas.Right,
+//	B = alpha * B * Aᵀ  if tA == blas.Trans and s == blas.Right,
+//	B = alpha * B * Aᴴ  if tA == blas.ConjTrans and s == blas.Right,
+//
+// where A is an n×n or m×m triangular matrix, B is an m×n matrix, and alpha is
+// a scalar. The m and n passed to the BLAS routine are b.Rows and b.Cols.
+func Trmm(s blas.Side, tA blas.Transpose, alpha complex128, a Triangular, b General) {
+	cblas128.Ztrmm(
+		s, a.Uplo, tA, a.Diag,
+		b.Rows, b.Cols,
+		alpha, a.Data, a.Stride,
+		b.Data, b.Stride,
+	)
+}
+
+// Trsm solves one of the triangular matrix equations
+//
+//	A * X = alpha * B   if tA == blas.NoTrans and s == blas.Left,
+//	Aᵀ * X = alpha * B  if tA == blas.Trans and s == blas.Left,
+//	Aᴴ * X = alpha * B  if tA == blas.ConjTrans and s == blas.Left,
+//	X * A = alpha * B   if tA == blas.NoTrans and s == blas.Right,
+//	X * Aᵀ = alpha * B  if tA == blas.Trans and s == blas.Right,
+//	X * Aᴴ = alpha * B  if tA == blas.ConjTrans and s == blas.Right,
+//
+// where A is an n×n or m×m triangular matrix, X and B are m×n matrices, and
+// alpha is a scalar.
+//
+// On entry b holds B; on return it holds the solution X, which
+// overwrites B in place.
+//
+// No check is made that A is invertible.
+func Trsm(s blas.Side, tA blas.Transpose, alpha complex128, a Triangular, b General) {
+	cblas128.Ztrsm(
+		s, a.Uplo, tA, a.Diag,
+		b.Rows, b.Cols,
+		alpha, a.Data, a.Stride,
+		b.Data, b.Stride,
+	)
+}
+
+// Hemm performs
+//
+//	C = alpha * A * B + beta * C  if s == blas.Left,
+//	C = alpha * B * A + beta * C  if s == blas.Right,
+//
+// where A is an n×n or m×m Hermitian matrix, B and C are m×n matrices, and
+// alpha and beta are scalars.
+func Hemm(s blas.Side, alpha complex128, a Hermitian, b General, beta complex128, c General) {
+	var m, n int
+	if s == blas.Left {
+		m, n = a.N, b.Cols
+	} else {
+		m, n = b.Rows, a.N
+	}
+	cblas128.Zhemm(s, a.Uplo, m, n, alpha, a.Data, a.Stride, b.Data, b.Stride, beta, c.Data, c.Stride)
+}
+
+// Herk performs the Hermitian rank-k update
+//
+//	C = alpha * A * Aᴴ + beta*C  if t == blas.NoTrans,
+//	C = alpha * Aᴴ * A + beta*C  if t == blas.ConjTrans,
+//
+// where C is an n×n Hermitian matrix, A is an n×k matrix if t == blas.NoTrans
+// and a k×n matrix otherwise, and alpha and beta are scalars.
+func Herk(t blas.Transpose, alpha float64, a General, beta float64, c Hermitian) {
+	var n, k int
+	if t == blas.NoTrans {
+		n, k = a.Rows, a.Cols
+	} else {
+		n, k = a.Cols, a.Rows
+	}
+	cblas128.Zherk(c.Uplo, t, n, k, alpha, a.Data, a.Stride, beta, c.Data, c.Stride)
+}
+
+// Her2k performs the Hermitian rank-2k update
+//
+//	C = alpha * A * Bᴴ + conj(alpha) * B * Aᴴ + beta * C  if t == blas.NoTrans,
+//	C = alpha * Aᴴ * B + conj(alpha) * Bᴴ * A + beta * C  if t == blas.ConjTrans,
+//
+// where C is an n×n Hermitian matrix, A and B are n×k matrices if t == NoTrans
+// and k×n matrices otherwise, and alpha and beta are scalars.
+func Her2k(t blas.Transpose, alpha complex128, a, b General, beta float64, c Hermitian) {
+	var n, k int
+	if t == blas.NoTrans {
+		n, k = a.Rows, a.Cols
+	} else {
+		n, k = a.Cols, a.Rows
+	}
+	cblas128.Zher2k(c.Uplo, t, n, k, alpha, a.Data, a.Stride, b.Data, b.Stride, beta, c.Data, c.Stride)
+}
--- a/vendor/gonum.org/v1/gonum/blas/cblas128/conv.go
+++ b/vendor/gonum.org/v1/gonum/blas/cblas128/conv.go
@@ -0,0 +1,279 @@
+// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.
+
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cblas128
+
+import "gonum.org/v1/gonum/blas"
+
+// GeneralCols represents a matrix using the conventional column-major storage scheme.
+type GeneralCols General
+
+// From copies the row-major matrix a into the column-major receiver.
+// The receiver must have the same dimensions as a, and its Data must
+// hold at least (Cols-1)*Stride+Rows elements; From panics otherwise.
+func (t GeneralCols) From(a General) {
+	switch {
+	case t.Rows != a.Rows || t.Cols != a.Cols:
+		panic("cblas128: mismatched dimension")
+	case len(t.Data) < (t.Cols-1)*t.Stride+t.Rows:
+		panic("cblas128: short data slice")
+	}
+	for i := 0; i < a.Rows; i++ {
+		// Row i of the source is contiguous; scatter it down column-major storage.
+		row := a.Data[i*a.Stride : i*a.Stride+a.Cols]
+		for j := range row {
+			t.Data[i+j*t.Stride] = row[j]
+		}
+	}
+}
+
+// From copies the column-major matrix a into the row-major receiver.
+// The receiver must have the same dimensions as a, and its Data must
+// hold at least (Rows-1)*Stride+Cols elements; From panics otherwise.
+func (t General) From(a GeneralCols) {
+	switch {
+	case t.Rows != a.Rows || t.Cols != a.Cols:
+		panic("cblas128: mismatched dimension")
+	case len(t.Data) < (t.Rows-1)*t.Stride+t.Cols:
+		panic("cblas128: short data slice")
+	}
+	for j := 0; j < a.Cols; j++ {
+		// Column j of the source is contiguous; scatter it across row-major storage.
+		col := a.Data[j*a.Stride : j*a.Stride+a.Rows]
+		for i := range col {
+			t.Data[i*t.Stride+j] = col[i]
+		}
+	}
+}
+
+// TriangularCols represents a matrix using the conventional column-major storage scheme.
+type TriangularCols Triangular
+
+// From copies the triangle of the row-major matrix a selected by its
+// Uplo into the column-major receiver. The receiver must match a in
+// dimension, uplo and diag; From panics otherwise, and also when Uplo
+// is not one of blas.Upper, blas.Lower or blas.All. The receiver needs
+// adequate backing data storage.
+func (t TriangularCols) From(a Triangular) {
+	switch {
+	case t.N != a.N:
+		panic("cblas128: mismatched dimension")
+	case t.Uplo != a.Uplo:
+		panic("cblas128: mismatched BLAS uplo")
+	case t.Diag != a.Diag:
+		panic("cblas128: mismatched BLAS diag")
+	}
+	upper, lower := a.Uplo == blas.Upper, a.Uplo == blas.Lower
+	if !upper && !lower && a.Uplo != blas.All {
+		panic("cblas128: bad BLAS uplo")
+	}
+	for i := 0; i < a.N; i++ {
+		// Columns [first, end) of row i are the ones copied for this uplo.
+		first, end := 0, a.N
+		if upper {
+			first = i
+		}
+		if lower {
+			end = i + 1
+		}
+		rowOff := i * a.Stride
+		for j := first; j < end; j++ {
+			t.Data[i+j*t.Stride] = a.Data[rowOff+j]
+		}
+	}
+}
+
+// From copies the triangle of the column-major matrix a selected by its
+// Uplo into the row-major receiver. The receiver must match a in
+// dimension, uplo and diag; From panics otherwise, and also when Uplo
+// is not one of blas.Upper, blas.Lower or blas.All. The receiver needs
+// adequate backing data storage.
+func (t Triangular) From(a TriangularCols) {
+	switch {
+	case t.N != a.N:
+		panic("cblas128: mismatched dimension")
+	case t.Uplo != a.Uplo:
+		panic("cblas128: mismatched BLAS uplo")
+	case t.Diag != a.Diag:
+		panic("cblas128: mismatched BLAS diag")
+	}
+	upper, lower := a.Uplo == blas.Upper, a.Uplo == blas.Lower
+	if !upper && !lower && a.Uplo != blas.All {
+		panic("cblas128: bad BLAS uplo")
+	}
+	for i := 0; i < a.N; i++ {
+		// Columns [first, end) of row i are the ones copied for this uplo.
+		first, end := 0, a.N
+		if upper {
+			first = i
+		}
+		if lower {
+			end = i + 1
+		}
+		rowOff := i * t.Stride
+		for j := first; j < end; j++ {
+			t.Data[rowOff+j] = a.Data[i+j*a.Stride]
+		}
+	}
+}
+
+// BandCols represents a matrix using the band column-major storage scheme.
+type BandCols Band
+
+// From copies the row-major band matrix a into the column-major band
+// receiver. The receiver must match a in dimensions and in both
+// bandwidths, and each stride must be at least KL+KU+1; From panics
+// otherwise. The receiver needs adequate backing data storage.
+func (t BandCols) From(a Band) {
+	switch {
+	case t.Rows != a.Rows || t.Cols != a.Cols:
+		panic("cblas128: mismatched dimension")
+	case t.KL != a.KL || t.KU != a.KU:
+		panic("cblas128: mismatched bandwidth")
+	case a.Stride < a.KL+a.KU+1:
+		panic("cblas128: short stride for source")
+	case t.Stride < t.KL+t.KU+1:
+		panic("cblas128: short stride for destination")
+	}
+	for i := 0; i < a.Rows; i++ {
+		// Row i holds columns i-KL through i+KU, clipped to the matrix.
+		first := max(0, i-a.KL)
+		end := min(i+a.KU+1, a.Cols)
+		for j := first; j < end; j++ {
+			t.Data[i+t.KU-j+j*t.Stride] = a.Data[j+a.KL-i+i*a.Stride]
+		}
+	}
+}
+
+// From fills the receiver with elements from a. The receiver
+// must have the same dimensions and bandwidth as a and have
+// adequate backing data storage. From panics if the dimensions or
+// bandwidths differ, or if either stride is less than KL+KU+1.
+//
+// Element (i, j) is read from a.Data[i+a.KU-j+j*a.Stride] and written
+// to t.Data[j+t.KL-i+i*t.Stride]. Each side is indexed with its own
+// stride, so a and t may use different strides.
+func (t Band) From(a BandCols) {
+	if t.Rows != a.Rows || t.Cols != a.Cols {
+		panic("cblas128: mismatched dimension")
+	}
+	if t.KL != a.KL || t.KU != a.KU {
+		panic("cblas128: mismatched bandwidth")
+	}
+	if a.Stride < a.KL+a.KU+1 {
+		panic("cblas128: short stride for source")
+	}
+	if t.Stride < t.KL+t.KU+1 {
+		panic("cblas128: short stride for destination")
+	}
+	for j := 0; j < a.Cols; j++ {
+		// Column j holds rows j-KU through j+KL, clipped to the matrix.
+		for i := max(0, j-a.KU); i < min(j+a.KL+1, a.Rows); i++ {
+			// The destination must be addressed with t.Stride and the
+			// source with a.Stride; swapping them corrupts the copy
+			// whenever the two strides differ.
+			t.Data[j+t.KL-i+i*t.Stride] = a.Data[i+a.KU-j+j*a.Stride]
+		}
+	}
+}
+
+// TriangularBandCols represents a triangular matrix using the band column-major storage scheme.
+type TriangularBandCols TriangularBand
+
+// From copies the row-major triangular band matrix a into the
+// column-major receiver. The receiver must match a in dimension,
+// bandwidth, uplo and diag, and both strides must be at least K+1;
+// From panics otherwise. The element copy itself is delegated to
+// BandCols.From.
+func (t TriangularBandCols) From(a TriangularBand) {
+	switch {
+	case t.N != a.N:
+		panic("cblas128: mismatched dimension")
+	case t.K != a.K:
+		panic("cblas128: mismatched bandwidth")
+	case a.Stride < a.K+1:
+		panic("cblas128: short stride for source")
+	case t.Stride < t.K+1:
+		panic("cblas128: short stride for destination")
+	case t.Uplo != a.Uplo:
+		panic("cblas128: mismatched BLAS uplo")
+	case t.Diag != a.Diag:
+		panic("cblas128: mismatched BLAS diag")
+	}
+	// Only the side of the band selected by Uplo is copied, so at most
+	// one of kl/ku is non-zero. t.K == a.K has been checked above.
+	var kl, ku int
+	switch a.Uplo {
+	case blas.Upper:
+		ku = a.K
+	case blas.Lower:
+		kl = a.K
+	default:
+		panic("cblas128: bad BLAS uplo")
+	}
+	dst := BandCols{Rows: t.N, Cols: t.N, KL: kl, KU: ku, Stride: t.Stride, Data: t.Data}
+	src := Band{Rows: a.N, Cols: a.N, KL: kl, KU: ku, Stride: a.Stride, Data: a.Data}
+	dst.From(src)
+}
+
+// From fills the receiver with elements from a. The receiver
+// must have the same dimensions, bandwidth and uplo as a and
+// have adequate backing data storage.
+func (t TriangularBand) From(a TriangularBandCols) {
+	if t.N != a.N {
+		panic("cblas128: mismatched dimension")
+	}
+	if t.K != a.K {
+		panic("cblas128: mismatched bandwidth")
+	}
+	if a.Stride < a.K+1 {
+		panic("cblas128: short stride for source")
+	}
+	if t.Stride < t.K+1 {
+		panic("cblas128: short stride for destination")
+	}
+	if t.Uplo != a.Uplo {
+		panic("cblas128: mismatched BLAS uplo")
+	}
+	if t.Diag != a.Diag {
+		panic("cblas128: mismatched BLAS diag")
+	}
+	dst := Band{
+		Rows: t.N, Cols: t.N,
+		Stride: t.Stride,
+		Data:   t.Data,
+	}
+	src := BandCols{
+		Rows: a.N, Cols: a.N,
+		Stride: a.Stride,
+		Data:   a.Data,
+	}
+	switch a.Uplo {
+	default:
+		panic("cblas128: bad BLAS uplo")
+	case blas.Upper:
+		dst.KU = t.K
+		src.KU = a.K
+	case blas.Lower:
+		dst.KL = t.K
+		src.KL = a.K
+	}
+	dst.From(src)
+}
+
+// min returns the smaller of a and b.
+func min(a, b int) int {
+	if b <= a {
+		return b
+	}
+	return a
+}
+
+// max returns the larger of a and b.
+func max(a, b int) int {
+	if b >= a {
+		return b
+	}
+	return a
+}
--- a/vendor/gonum.org/v1/gonum/blas/cblas128/conv_hermitian.go
+++ b/vendor/gonum.org/v1/gonum/blas/cblas128/conv_hermitian.go
@@ -0,0 +1,155 @@
+// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.
+
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cblas128
+
+import "gonum.org/v1/gonum/blas"
+
+// HermitianCols represents a matrix using the conventional column-major storage scheme.
+type HermitianCols Hermitian
+
+// From copies the triangle of the row-major matrix a selected by its
+// Uplo into the column-major receiver. The receiver must match a in
+// dimension and uplo; From panics otherwise, and also when Uplo is
+// neither blas.Upper nor blas.Lower. The receiver needs adequate
+// backing data storage.
+func (t HermitianCols) From(a Hermitian) {
+	switch {
+	case t.N != a.N:
+		panic("cblas128: mismatched dimension")
+	case t.Uplo != a.Uplo:
+		panic("cblas128: mismatched BLAS uplo")
+	}
+	upper := a.Uplo == blas.Upper
+	if !upper && a.Uplo != blas.Lower {
+		panic("cblas128: bad BLAS uplo")
+	}
+	for i := 0; i < a.N; i++ {
+		// Lower: columns 0..i; upper: columns i..N-1.
+		first, end := 0, i+1
+		if upper {
+			first, end = i, a.N
+		}
+		rowOff := i * a.Stride
+		for j := first; j < end; j++ {
+			t.Data[i+j*t.Stride] = a.Data[rowOff+j]
+		}
+	}
+}
+
+// From copies the triangle of the column-major matrix a selected by its
+// Uplo into the row-major receiver. The receiver must match a in
+// dimension and uplo; From panics otherwise, and also when Uplo is
+// neither blas.Upper nor blas.Lower. The receiver needs adequate
+// backing data storage.
+func (t Hermitian) From(a HermitianCols) {
+	switch {
+	case t.N != a.N:
+		panic("cblas128: mismatched dimension")
+	case t.Uplo != a.Uplo:
+		panic("cblas128: mismatched BLAS uplo")
+	}
+	upper := a.Uplo == blas.Upper
+	if !upper && a.Uplo != blas.Lower {
+		panic("cblas128: bad BLAS uplo")
+	}
+	for i := 0; i < a.N; i++ {
+		// Lower: columns 0..i; upper: columns i..N-1.
+		first, end := 0, i+1
+		if upper {
+			first, end = i, a.N
+		}
+		rowOff := i * t.Stride
+		for j := first; j < end; j++ {
+			t.Data[rowOff+j] = a.Data[i+j*a.Stride]
+		}
+	}
+}
+
+// HermitianBandCols represents an Hermitian matrix using the band column-major storage scheme.
+type HermitianBandCols HermitianBand
+
+// From copies the row-major Hermitian band matrix a into the
+// column-major receiver. The receiver must match a in dimension,
+// bandwidth and uplo, and both strides must be at least K+1; From
+// panics otherwise. The element copy itself is delegated to
+// BandCols.From.
+func (t HermitianBandCols) From(a HermitianBand) {
+	switch {
+	case t.N != a.N:
+		panic("cblas128: mismatched dimension")
+	case t.K != a.K:
+		panic("cblas128: mismatched bandwidth")
+	case a.Stride < a.K+1:
+		panic("cblas128: short stride for source")
+	case t.Stride < t.K+1:
+		panic("cblas128: short stride for destination")
+	case t.Uplo != a.Uplo:
+		panic("cblas128: mismatched BLAS uplo")
+	}
+	// Only the side of the band selected by Uplo is copied, so at most
+	// one of kl/ku is non-zero. t.K == a.K has been checked above.
+	var kl, ku int
+	switch a.Uplo {
+	case blas.Upper:
+		ku = a.K
+	case blas.Lower:
+		kl = a.K
+	default:
+		panic("cblas128: bad BLAS uplo")
+	}
+	dst := BandCols{Rows: t.N, Cols: t.N, KL: kl, KU: ku, Stride: t.Stride, Data: t.Data}
+	src := Band{Rows: a.N, Cols: a.N, KL: kl, KU: ku, Stride: a.Stride, Data: a.Data}
+	dst.From(src)
+}
+
+// From fills the receiver with elements from a. The receiver
+// must have the same dimensions, bandwidth and uplo as a and
+// have adequate backing data storage.
+func (t HermitianBand) From(a HermitianBandCols) {
+	if t.N != a.N {
+		panic("cblas128: mismatched dimension")
+	}
+	if t.K != a.K {
+		panic("cblas128: mismatched bandwidth")
+	}
+	if a.Stride < a.K+1 {
+		panic("cblas128: short stride for source")
+	}
+	if t.Stride < t.K+1 {
+		panic("cblas128: short stride for destination")
+	}
+	if t.Uplo != a.Uplo {
+		panic("cblas128: mismatched BLAS uplo")
+	}
+	dst := Band{
+		Rows: t.N, Cols: t.N,
+		Stride: t.Stride,
+		Data:   t.Data,
+	}
+	src := BandCols{
+		Rows: a.N, Cols: a.N,
+		Stride: a.Stride,
+		Data:   a.Data,
+	}
+	switch a.Uplo {
+	default:
+		panic("cblas128: bad BLAS uplo")
+	case blas.Upper:
+		dst.KU = t.K
+		src.KU = a.K
+	case blas.Lower:
+		dst.KL = t.K
+		src.KL = a.K
+	}
+	dst.From(src)
+}
--- a/vendor/gonum.org/v1/gonum/blas/cblas128/conv_symmetric.go
+++ b/vendor/gonum.org/v1/gonum/blas/cblas128/conv_symmetric.go
@@ -0,0 +1,155 @@
+// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.
+
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package cblas128
+
+import "gonum.org/v1/gonum/blas"
+
+// SymmetricCols represents a matrix using the conventional column-major storage scheme.
+type SymmetricCols Symmetric
+
+// From copies the triangle of the row-major matrix a selected by its
+// Uplo into the column-major receiver. The receiver must match a in
+// dimension and uplo; From panics otherwise, and also when Uplo is
+// neither blas.Upper nor blas.Lower. The receiver needs adequate
+// backing data storage.
+func (t SymmetricCols) From(a Symmetric) {
+	switch {
+	case t.N != a.N:
+		panic("cblas128: mismatched dimension")
+	case t.Uplo != a.Uplo:
+		panic("cblas128: mismatched BLAS uplo")
+	}
+	upper := a.Uplo == blas.Upper
+	if !upper && a.Uplo != blas.Lower {
+		panic("cblas128: bad BLAS uplo")
+	}
+	for i := 0; i < a.N; i++ {
+		// Lower: columns 0..i; upper: columns i..N-1.
+		first, end := 0, i+1
+		if upper {
+			first, end = i, a.N
+		}
+		rowOff := i * a.Stride
+		for j := first; j < end; j++ {
+			t.Data[i+j*t.Stride] = a.Data[rowOff+j]
+		}
+	}
+}
+
+// From copies the triangle of the column-major matrix a selected by its
+// Uplo into the row-major receiver. The receiver must match a in
+// dimension and uplo; From panics otherwise, and also when Uplo is
+// neither blas.Upper nor blas.Lower. The receiver needs adequate
+// backing data storage.
+func (t Symmetric) From(a SymmetricCols) {
+	switch {
+	case t.N != a.N:
+		panic("cblas128: mismatched dimension")
+	case t.Uplo != a.Uplo:
+		panic("cblas128: mismatched BLAS uplo")
+	}
+	upper := a.Uplo == blas.Upper
+	if !upper && a.Uplo != blas.Lower {
+		panic("cblas128: bad BLAS uplo")
+	}
+	for i := 0; i < a.N; i++ {
+		// Lower: columns 0..i; upper: columns i..N-1.
+		first, end := 0, i+1
+		if upper {
+			first, end = i, a.N
+		}
+		rowOff := i * t.Stride
+		for j := first; j < end; j++ {
+			t.Data[rowOff+j] = a.Data[i+j*a.Stride]
+		}
+	}
+}
+
+// SymmetricBandCols represents a symmetric matrix using the band column-major storage scheme.
+type SymmetricBandCols SymmetricBand
+
+// From copies the row-major symmetric band matrix a into the
+// column-major receiver. The receiver must match a in dimension,
+// bandwidth and uplo, and both strides must be at least K+1; From
+// panics otherwise. The element copy itself is delegated to
+// BandCols.From.
+func (t SymmetricBandCols) From(a SymmetricBand) {
+	switch {
+	case t.N != a.N:
+		panic("cblas128: mismatched dimension")
+	case t.K != a.K:
+		panic("cblas128: mismatched bandwidth")
+	case a.Stride < a.K+1:
+		panic("cblas128: short stride for source")
+	case t.Stride < t.K+1:
+		panic("cblas128: short stride for destination")
+	case t.Uplo != a.Uplo:
+		panic("cblas128: mismatched BLAS uplo")
+	}
+	// Only the side of the band selected by Uplo is copied, so at most
+	// one of kl/ku is non-zero. t.K == a.K has been checked above.
+	var kl, ku int
+	switch a.Uplo {
+	case blas.Upper:
+		ku = a.K
+	case blas.Lower:
+		kl = a.K
+	default:
+		panic("cblas128: bad BLAS uplo")
+	}
+	dst := BandCols{Rows: t.N, Cols: t.N, KL: kl, KU: ku, Stride: t.Stride, Data: t.Data}
+	src := Band{Rows: a.N, Cols: a.N, KL: kl, KU: ku, Stride: a.Stride, Data: a.Data}
+	dst.From(src)
+}
+
+// From fills the receiver with elements from a. The receiver
+// must have the same dimensions, bandwidth and uplo as a and
+// have adequate backing data storage.
+func (t SymmetricBand) From(a SymmetricBandCols) {
+	if t.N != a.N {
+		panic("cblas128: mismatched dimension")
+	}
+	if t.K != a.K {
+		panic("cblas128: mismatched bandwidth")
+	}
+	if a.Stride < a.K+1 {
+		panic("cblas128: short stride for source")
+	}
+	if t.Stride < t.K+1 {
+		panic("cblas128: short stride for destination")
+	}
+	if t.Uplo != a.Uplo {
+		panic("cblas128: mismatched BLAS uplo")
+	}
+	dst := Band{
+		Rows: t.N, Cols: t.N,
+		Stride: t.Stride,
+		Data:   t.Data,
+	}
+	src := BandCols{
+		Rows: a.N, Cols: a.N,
+		Stride: a.Stride,
+		Data:   a.Data,
+	}
+	switch a.Uplo {
+	default:
+		panic("cblas128: bad BLAS uplo")
+	case blas.Upper:
+		dst.KU = t.K
+		src.KU = a.K
+	case blas.Lower:
+		dst.KL = t.K
+		src.KL = a.K
+	}
+	dst.From(src)
+}
--- a/vendor/gonum.org/v1/gonum/blas/cblas128/doc.go
+++ b/vendor/gonum.org/v1/gonum/blas/cblas128/doc.go
@@ -0,0 +1,6 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package cblas128 provides a simple interface to the complex128 BLAS API.
+package cblas128 // import "gonum.org/v1/gonum/blas/cblas128"
--- a/vendor/gonum.org/v1/gonum/blas/conversions.bash
+++ b/vendor/gonum.org/v1/gonum/blas/conversions.bash
@@ -0,0 +1,159 @@
+#!/usr/bin/env bash
+
+# Copyright ©2017 The Gonum Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+# Generate code for blas32.
+echo Generating blas32/conv.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > blas32/conv.go
+cat blas64/conv.go \
+| gofmt -r 'float64 -> float32' \
+\
+| sed -e 's/blas64/blas32/' \
+\
+>> blas32/conv.go
+
+echo Generating blas32/conv_test.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > blas32/conv_test.go
+cat blas64/conv_test.go \
+| gofmt -r 'float64 -> float32' \
+\
+| sed -e 's/blas64/blas32/' \
+      -e 's_"math"_math "gonum.org/v1/gonum/internal/math32"_' \
+\
+>> blas32/conv_test.go
+
+echo Generating blas32/conv_symmetric.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > blas32/conv_symmetric.go
+cat blas64/conv_symmetric.go \
+| gofmt -r 'float64 -> float32' \
+\
+| sed -e 's/blas64/blas32/' \
+\
+>> blas32/conv_symmetric.go
+
+echo Generating blas32/conv_symmetric_test.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > blas32/conv_symmetric_test.go
+cat blas64/conv_symmetric_test.go \
+| gofmt -r 'float64 -> float32' \
+\
+| sed -e 's/blas64/blas32/' \
+      -e 's_"math"_math "gonum.org/v1/gonum/internal/math32"_' \
+\
+>> blas32/conv_symmetric_test.go
+
+
+# Generate code for cblas128.
+echo Generating cblas128/conv.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas128/conv.go
+cat blas64/conv.go \
+| gofmt -r 'float64 -> complex128' \
+\
+| sed -e 's/blas64/cblas128/' \
+\
+>> cblas128/conv.go
+
+echo Generating cblas128/conv_test.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas128/conv_test.go
+cat blas64/conv_test.go \
+| gofmt -r 'float64 -> complex128' \
+\
+| sed -e 's/blas64/cblas128/' \
+      -e 's_"math"_math "math/cmplx"_' \
+\
+>> cblas128/conv_test.go
+
+echo Generating cblas128/conv_symmetric.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas128/conv_symmetric.go
+cat blas64/conv_symmetric.go \
+| gofmt -r 'float64 -> complex128' \
+\
+| sed -e 's/blas64/cblas128/' \
+\
+>> cblas128/conv_symmetric.go
+
+echo Generating cblas128/conv_symmetric_test.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas128/conv_symmetric_test.go
+cat blas64/conv_symmetric_test.go \
+| gofmt -r 'float64 -> complex128' \
+\
+| sed -e 's/blas64/cblas128/' \
+      -e 's_"math"_math "math/cmplx"_' \
+\
+>> cblas128/conv_symmetric_test.go
+
+echo Generating cblas128/conv_hermitian.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas128/conv_hermitian.go
+cat blas64/conv_symmetric.go \
+| gofmt -r 'float64 -> complex128' \
+\
+| sed -e 's/blas64/cblas128/' \
+      -e 's/Symmetric/Hermitian/g' \
+      -e 's/a symmetric/an Hermitian/g' \
+      -e 's/symmetric/hermitian/g' \
+      -e 's/Sym/Herm/g' \
+\
+>> cblas128/conv_hermitian.go
+
+echo Generating cblas128/conv_hermitian_test.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas128/conv_hermitian_test.go
+cat blas64/conv_symmetric_test.go \
+| gofmt -r 'float64 -> complex128' \
+\
+| sed -e 's/blas64/cblas128/' \
+      -e 's/Symmetric/Hermitian/g' \
+      -e 's/a symmetric/an Hermitian/g' \
+      -e 's/symmetric/hermitian/g' \
+      -e 's/Sym/Herm/g' \
+      -e 's_"math"_math "math/cmplx"_' \
+\
+>> cblas128/conv_hermitian_test.go
+
+
+# Generate code for cblas64.
+echo Generating cblas64/conv.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas64/conv.go
+cat blas64/conv.go \
+| gofmt -r 'float64 -> complex64' \
+\
+| sed -e 's/blas64/cblas64/' \
+\
+>> cblas64/conv.go
+
+echo Generating cblas64/conv_test.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas64/conv_test.go
+cat blas64/conv_test.go \
+| gofmt -r 'float64 -> complex64' \
+\
+| sed -e 's/blas64/cblas64/' \
+      -e 's_"math"_math "gonum.org/v1/gonum/internal/cmplx64"_' \
+\
+>> cblas64/conv_test.go
+
+echo Generating cblas64/conv_hermitian.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas64/conv_hermitian.go
+cat blas64/conv_symmetric.go \
+| gofmt -r 'float64 -> complex64' \
+\
+| sed -e 's/blas64/cblas64/' \
+      -e 's/Symmetric/Hermitian/g' \
+      -e 's/a symmetric/an Hermitian/g' \
+      -e 's/symmetric/hermitian/g' \
+      -e 's/Sym/Herm/g' \
+\
+>> cblas64/conv_hermitian.go
+
+echo Generating cblas64/conv_hermitian_test.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas”; DO NOT EDIT.\n' > cblas64/conv_hermitian_test.go
+cat blas64/conv_symmetric_test.go \
+| gofmt -r 'float64 -> complex64' \
+\
+| sed -e 's/blas64/cblas64/' \
+      -e 's/Symmetric/Hermitian/g' \
+      -e 's/a symmetric/an Hermitian/g' \
+      -e 's/symmetric/hermitian/g' \
+      -e 's/Sym/Herm/g' \
+      -e 's_"math"_math "gonum.org/v1/gonum/internal/cmplx64"_' \
+\
+>> cblas64/conv_hermitian_test.go
--- a/vendor/gonum.org/v1/gonum/blas/doc.go
+++ b/vendor/gonum.org/v1/gonum/blas/doc.go
@@ -0,0 +1,108 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/*
+Package blas provides interfaces for the BLAS linear algebra standard.
+
+All methods must perform appropriate parameter checking and panic if
+provided parameters that do not conform to the requirements specified
+by the BLAS standard.
+
+Quick Reference Guide to the BLAS from http://www.netlib.org/lapack/lug/node145.html
+
+This version is modified to remove the "order" option. All matrix operations are
+on row-order matrices.
+
+Level 1 BLAS
+
+	        dim scalar vector   vector   scalars              5-element prefixes
+	                                                          struct
+
+	_rotg (                                      a, b )                S, D
+	_rotmg(                              d1, d2, a, b )                S, D
+	_rot  ( n,         x, incX, y, incY,               c, s )          S, D
+	_rotm ( n,         x, incX, y, incY,                      param )  S, D
+	_swap ( n,         x, incX, y, incY )                              S, D, C, Z
+	_scal ( n,  alpha, x, incX )                                       S, D, C, Z, Cs, Zd
+	_copy ( n,         x, incX, y, incY )                              S, D, C, Z
+	_axpy ( n,  alpha, x, incX, y, incY )                              S, D, C, Z
+	_dot  ( n,         x, incX, y, incY )                              S, D, Ds
+	_dotu ( n,         x, incX, y, incY )                              C, Z
+	_dotc ( n,         x, incX, y, incY )                              C, Z
+	__dot ( n,  alpha, x, incX, y, incY )                              Sds
+	_nrm2 ( n,         x, incX )                                       S, D, Sc, Dz
+	_asum ( n,         x, incX )                                       S, D, Sc, Dz
+	I_amax( n,         x, incX )                                       s, d, c, z
+
+Level 2 BLAS
+
+	        options                   dim   b-width scalar matrix  vector   scalar vector   prefixes
+
+	_gemv (        trans,      m, n,         alpha, a, lda, x, incX, beta,  y, incY ) S, D, C, Z
+	_gbmv (        trans,      m, n, kL, kU, alpha, a, lda, x, incX, beta,  y, incY ) S, D, C, Z
+	_hemv ( uplo,                 n,         alpha, a, lda, x, incX, beta,  y, incY ) C, Z
+	_hbmv ( uplo,                 n, k,      alpha, a, lda, x, incX, beta,  y, incY ) C, Z
+	_hpmv ( uplo,                 n,         alpha, ap,     x, incX, beta,  y, incY ) C, Z
+	_symv ( uplo,                 n,         alpha, a, lda, x, incX, beta,  y, incY ) S, D
+	_sbmv ( uplo,                 n, k,      alpha, a, lda, x, incX, beta,  y, incY ) S, D
+	_spmv ( uplo,                 n,         alpha, ap,     x, incX, beta,  y, incY ) S, D
+	_trmv ( uplo, trans, diag,    n,                a, lda, x, incX )                 S, D, C, Z
+	_tbmv ( uplo, trans, diag,    n, k,             a, lda, x, incX )                 S, D, C, Z
+	_tpmv ( uplo, trans, diag,    n,                ap,     x, incX )                 S, D, C, Z
+	_trsv ( uplo, trans, diag,    n,                a, lda, x, incX )                 S, D, C, Z
+	_tbsv ( uplo, trans, diag,    n, k,             a, lda, x, incX )                 S, D, C, Z
+	_tpsv ( uplo, trans, diag,    n,                ap,     x, incX )                 S, D, C, Z
+
+	        options                   dim   scalar vector   vector   matrix  prefixes
+
+	_ger  (                    m, n, alpha, x, incX, y, incY, a, lda ) S, D
+	_geru (                    m, n, alpha, x, incX, y, incY, a, lda ) C, Z
+	_gerc (                    m, n, alpha, x, incX, y, incY, a, lda ) C, Z
+	_her  ( uplo,                 n, alpha, x, incX,          a, lda ) C, Z
+	_hpr  ( uplo,                 n, alpha, x, incX,          ap )     C, Z
+	_her2 ( uplo,                 n, alpha, x, incX, y, incY, a, lda ) C, Z
+	_hpr2 ( uplo,                 n, alpha, x, incX, y, incY, ap )     C, Z
+	_syr  ( uplo,                 n, alpha, x, incX,          a, lda ) S, D
+	_spr  ( uplo,                 n, alpha, x, incX,          ap )     S, D
+	_syr2 ( uplo,                 n, alpha, x, incX, y, incY, a, lda ) S, D
+	_spr2 ( uplo,                 n, alpha, x, incX, y, incY, ap )     S, D
+
+Level 3 BLAS
+
+	        options                                 dim      scalar matrix  matrix  scalar matrix  prefixes
+
+	_gemm (             transA, transB,      m, n, k, alpha, a, lda, b, ldb, beta,  c, ldc ) S, D, C, Z
+	_symm ( side, uplo,                      m, n,    alpha, a, lda, b, ldb, beta,  c, ldc ) S, D, C, Z
+	_hemm ( side, uplo,                      m, n,    alpha, a, lda, b, ldb, beta,  c, ldc ) C, Z
+	_syrk (       uplo, trans,                  n, k, alpha, a, lda,         beta,  c, ldc ) S, D, C, Z
+	_herk (       uplo, trans,                  n, k, alpha, a, lda,         beta,  c, ldc ) C, Z
+	_syr2k(       uplo, trans,                  n, k, alpha, a, lda, b, ldb, beta,  c, ldc ) S, D, C, Z
+	_her2k(       uplo, trans,                  n, k, alpha, a, lda, b, ldb, beta,  c, ldc ) C, Z
+	_trmm ( side, uplo, transA,        diag, m, n,    alpha, a, lda, b, ldb )                S, D, C, Z
+	_trsm ( side, uplo, transA,        diag, m, n,    alpha, a, lda, b, ldb )                S, D, C, Z
+
+Meaning of prefixes
+
+	S - float32	C - complex64
+	D - float64	Z - complex128
+
+Matrix types
+
+	GE - GEneral 		GB - General Band
+	SY - SYmmetric 		SB - Symmetric Band 	SP - Symmetric Packed
+	HE - HErmitian 		HB - Hermitian Band 	HP - Hermitian Packed
+	TR - TRiangular 	TB - Triangular Band 	TP - Triangular Packed
+
+Options
+
+	trans 	= NoTrans, Trans, ConjTrans
+	uplo 	= Upper, Lower
+	diag 	= Nonunit, Unit
+	side 	= Left, Right (A or op(A) on the left, or A or op(A) on the right)
+
+For real matrices, Trans and ConjTrans have the same meaning.
+For Hermitian matrices, trans = Trans is not allowed.
+For complex symmetric matrices, trans = ConjTrans is not allowed.
+*/
+package blas // import "gonum.org/v1/gonum/blas"
--- a/vendor/gonum.org/v1/gonum/blas/gonum/dgemm.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/dgemm.go
@@ -0,0 +1,297 @@
+// Copyright ©2014 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"runtime"
+	"sync"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/internal/asm/f64"
+)
+
+// Dgemm performs one of the matrix-matrix operations
+//
+//	C = alpha * A * B + beta * C
+//	C = alpha * Aᵀ * B + beta * C
+//	C = alpha * A * Bᵀ + beta * C
+//	C = alpha * Aᵀ * Bᵀ + beta * C
+//
+// where A is an m×k or k×m dense matrix, B is an n×k or k×n dense matrix, C is
+// an m×n matrix, and alpha and beta are scalars. tA and tB specify whether A or
+// B are transposed.
+//
+// Dgemm panics if tA or tB is not a valid blas.Transpose value, if m, n or k
+// is negative, if a leading dimension is too small for the corresponding
+// matrix, or if a, b or c is too short to hold its matrix. The checks run in
+// that order and the first failure is the one reported.
+func (Implementation) Dgemm(tA, tB blas.Transpose, m, n, k int, alpha float64, a []float64, lda int, b []float64, ldb int, beta float64, c []float64, ldc int) {
+	switch tA {
+	default:
+		panic(badTranspose)
+	case blas.NoTrans, blas.Trans, blas.ConjTrans:
+	}
+	switch tB {
+	default:
+		panic(badTranspose)
+	case blas.NoTrans, blas.Trans, blas.ConjTrans:
+	}
+	if m < 0 {
+		panic(mLT0)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if k < 0 {
+		panic(kLT0)
+	}
+	// Trans and ConjTrans are treated identically from here on.
+	aTrans := tA == blas.Trans || tA == blas.ConjTrans
+	if aTrans {
+		// A is stored as k×m, so each stored row holds m elements.
+		if lda < max(1, m) {
+			panic(badLdA)
+		}
+	} else {
+		// A is stored as m×k, so each stored row holds k elements.
+		if lda < max(1, k) {
+			panic(badLdA)
+		}
+	}
+	bTrans := tB == blas.Trans || tB == blas.ConjTrans
+	if bTrans {
+		// B is stored as n×k, so each stored row holds k elements.
+		if ldb < max(1, k) {
+			panic(badLdB)
+		}
+	} else {
+		// B is stored as k×n, so each stored row holds n elements.
+		if ldb < max(1, n) {
+			panic(badLdB)
+		}
+	}
+	if ldc < max(1, n) {
+		panic(badLdC)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	// Each bound is (rows-1)*ld + cols: the last stored row need not be padded
+	// out to the full leading dimension.
+	if aTrans {
+		if len(a) < (k-1)*lda+m {
+			panic(shortA)
+		}
+	} else {
+		if len(a) < (m-1)*lda+k {
+			panic(shortA)
+		}
+	}
+	if bTrans {
+		if len(b) < (n-1)*ldb+k {
+			panic(shortB)
+		}
+	} else {
+		if len(b) < (k-1)*ldb+n {
+			panic(shortB)
+		}
+	}
+	if len(c) < (m-1)*ldc+n {
+		panic(shortC)
+	}
+
+	// Quick return if possible.
+	// With no product contribution and beta == 1, C is left unchanged.
+	if (alpha == 0 || k == 0) && beta == 1 {
+		return
+	}
+
+	// scale c
+	if beta != 1 {
+		if beta == 0 {
+			// Assign zero rather than multiplying, so existing contents of c
+			// (including NaN or Inf) are overwritten.
+			for i := 0; i < m; i++ {
+				ctmp := c[i*ldc : i*ldc+n]
+				for j := range ctmp {
+					ctmp[j] = 0
+				}
+			}
+		} else {
+			for i := 0; i < m; i++ {
+				ctmp := c[i*ldc : i*ldc+n]
+				for j := range ctmp {
+					ctmp[j] *= beta
+				}
+			}
+		}
+	}
+
+	// c now holds beta*C; accumulate the alpha-scaled product into it.
+	dgemmParallel(aTrans, bTrans, m, n, k, a, lda, b, ldb, c, ldc, alpha)
+}
+
+// dgemmParallel adds alpha times the (optionally transposed) product of a and
+// b into c, splitting the work into blockSize×blockSize blocks of c that are
+// computed by concurrent goroutines. It falls back to dgemmSerial when there
+// are fewer than minParBlock blocks.
+func dgemmParallel(aTrans, bTrans bool, m, n, k int, a []float64, lda int, b []float64, ldb int, c []float64, ldc int, alpha float64) {
+	// dgemmParallel computes a parallel matrix multiplication by partitioning
+	// a and b into sub-blocks, and updating c with the multiplication of the
+	// sub-blocks.
+	// In all cases,
+	// A = [ 	A_11	A_12 ... 	A_1j
+	//			A_21	A_22 ...	A_2j
+	//				...
+	//			A_i1	A_i2 ...	A_ij]
+	//
+	// and same for B. All of the submatrix sizes are blockSize×blockSize except
+	// at the edges.
+	//
+	// In all cases, there is one dimension for each matrix along which
+	// C must be updated sequentially.
+	// Cij = \sum_k Aik Bkj,	(A * B)
+	// Cij = \sum_k Aki Bkj,	(Aᵀ * B)
+	// Cij = \sum_k Aik Bjk,	(A * Bᵀ)
+	// Cij = \sum_k Aki Bjk,	(Aᵀ * Bᵀ)
+	//
+	// This code computes one {i, j} block sequentially along the k dimension,
+	// and computes all of the {i, j} blocks concurrently. This
+	// partitioning allows Cij to be updated in-place without race-conditions.
+	// Instead of launching all goroutines at once, a buffered channel is used
+	// as a semaphore so that only a bounded number of block computations are
+	// in flight at any time.
+	//
+	// http://alexkr.com/docs/matrixmult.pdf is a good reference on matrix-matrix
+	// multiplies, though this code does not copy matrices to attempt to eliminate
+	// cache misses.
+
+	// maxKLen keeps the full extent of the k dimension, because k is reused
+	// as the block loop index inside the worker below.
+	maxKLen := k
+	parBlocks := blocks(m, blockSize) * blocks(n, blockSize)
+	if parBlocks < minParBlock {
+		// The matrix multiplication is small in the dimensions where it can be
+		// computed concurrently. Just do it in serial.
+		dgemmSerial(aTrans, bTrans, m, n, k, a, lda, b, ldb, c, ldc, alpha)
+		return
+	}
+
+	// workerLimit acts as a cap on the number of concurrent workers,
+	// with the limit set to the number of procs available.
+	workerLimit := make(chan struct{}, runtime.GOMAXPROCS(0))
+
+	// wg is used to wait for all of the block workers to finish before
+	// dgemmParallel returns.
+	var wg sync.WaitGroup
+	wg.Add(parBlocks)
+	defer wg.Wait()
+
+	for i := 0; i < m; i += blockSize {
+		for j := 0; j < n; j += blockSize {
+			// Blocks here until a worker slot is free.
+			workerLimit <- struct{}{}
+			go func(i, j int) {
+				defer func() {
+					wg.Done()
+					<-workerLimit
+				}()
+
+				// Clip the block at the bottom and right edges of c.
+				leni := blockSize
+				if i+leni > m {
+					leni = m - i
+				}
+				lenj := blockSize
+				if j+lenj > n {
+					lenj = n - j
+				}
+
+				cSub := sliceView64(c, ldc, i, j, leni, lenj)
+
+				// Compute A_ik B_kj for all k
+				for k := 0; k < maxKLen; k += blockSize {
+					lenk := blockSize
+					if k+lenk > maxKLen {
+						lenk = maxKLen - k
+					}
+					var aSub, bSub []float64
+					// For a transposed operand the block is addressed with
+					// its row and column roles swapped.
+					if aTrans {
+						aSub = sliceView64(a, lda, k, i, lenk, leni)
+					} else {
+						aSub = sliceView64(a, lda, i, k, leni, lenk)
+					}
+					if bTrans {
+						bSub = sliceView64(b, ldb, j, k, lenj, lenk)
+					} else {
+						bSub = sliceView64(b, ldb, k, j, lenk, lenj)
+					}
+					dgemmSerial(aTrans, bTrans, leni, lenj, lenk, aSub, lda, bSub, ldb, cSub, ldc, alpha)
+				}
+			}(i, j)
+		}
+	}
+}
+
+// dgemmSerial accumulates alpha times the product of a and b into c on the
+// calling goroutine, dispatching to the kernel that matches the transpose
+// flags of a and b.
+func dgemmSerial(aTrans, bTrans bool, m, n, k int, a []float64, lda int, b []float64, ldb int, c []float64, ldc int, alpha float64) {
+	if aTrans {
+		if bTrans {
+			dgemmSerialTransTrans(m, n, k, a, lda, b, ldb, c, ldc, alpha)
+		} else {
+			dgemmSerialTransNot(m, n, k, a, lda, b, ldb, c, ldc, alpha)
+		}
+		return
+	}
+	if bTrans {
+		dgemmSerialNotTrans(m, n, k, a, lda, b, ldb, c, ldc, alpha)
+		return
+	}
+	dgemmSerialNotNot(m, n, k, a, lda, b, ldb, c, ldc, alpha)
+}
+
+// dgemmSerialNotNot is the dgemmSerial kernel for the case where neither a
+// nor b is transposed. It adds alpha * a * b into c.
+func dgemmSerialNotNot(m, n, k int, a []float64, lda int, b []float64, ldb int, c []float64, ldc int, alpha float64) {
+	// Rows are sliced once and then ranged over, rather than indexing with
+	// the literal [i*stride+j] form, because the original authors measured
+	// this style as approximately 5 times faster as of go 1.3.
+	for i := 0; i < m; i++ {
+		cRow := c[i*ldc : i*ldc+n]
+		aRow := a[i*lda : i*lda+k]
+		for l := range aRow {
+			scaled := alpha * aRow[l]
+			if scaled == 0 {
+				// A zero coefficient contributes nothing to this row of c.
+				continue
+			}
+			f64.AxpyUnitary(scaled, b[l*ldb:l*ldb+n], cRow)
+		}
+	}
+}
+
+// dgemmSerialTransNot is the dgemmSerial kernel for the case where a is
+// transposed and b is not. It adds alpha * aᵀ * b into c.
+func dgemmSerialTransNot(m, n, k int, a []float64, lda int, b []float64, ldb int, c []float64, ldc int, alpha float64) {
+	// Rows are sliced once and then ranged over, rather than indexing with
+	// the literal [i*stride+j] form, because the original authors measured
+	// this style as approximately 5 times faster as of go 1.3.
+	for l := 0; l < k; l++ {
+		bRow := b[l*ldb : l*ldb+n]
+		aRow := a[l*lda : l*lda+m]
+		for i := range aRow {
+			scaled := alpha * aRow[i]
+			if scaled == 0 {
+				// A zero coefficient contributes nothing to row i of c.
+				continue
+			}
+			f64.AxpyUnitary(scaled, bRow, c[i*ldc:i*ldc+n])
+		}
+	}
+}
+
+// dgemmSerialNotTrans is the dgemmSerial kernel for the case where a is not
+// transposed and b is. It adds alpha * a * bᵀ into c.
+func dgemmSerialNotTrans(m, n, k int, a []float64, lda int, b []float64, ldb int, c []float64, ldc int, alpha float64) {
+	// Rows are sliced once, rather than indexing with the literal
+	// [i*stride+j] form, because the original authors measured this style
+	// as approximately 5 times faster as of go 1.3.
+	for i := 0; i < m; i++ {
+		aRow := a[i*lda : i*lda+k]
+		cRow := c[i*ldc : i*ldc+n]
+		for j := range cRow {
+			// Element (i, j) of a*bᵀ is the dot product of row i of a
+			// with row j of b.
+			cRow[j] += alpha * f64.DotUnitary(aRow, b[j*ldb:j*ldb+k])
+		}
+	}
+}
+
+// dgemmSerialTransTrans is the dgemmSerial kernel for the case where both a
+// and b are transposed. It adds alpha * aᵀ * bᵀ into c.
+func dgemmSerialTransTrans(m, n, k int, a []float64, lda int, b []float64, ldb int, c []float64, ldc int, alpha float64) {
+	// Rows of a are sliced once and then ranged over, rather than indexing
+	// with the literal [i*stride+j] form, because the original authors
+	// measured this style as approximately 5 times faster as of go 1.3.
+	for l := 0; l < k; l++ {
+		aRow := a[l*lda : l*lda+m]
+		for i := range aRow {
+			scaled := alpha * aRow[i]
+			if scaled == 0 {
+				// A zero coefficient contributes nothing to row i of c.
+				continue
+			}
+			// b is read starting at element l and advancing by ldb per
+			// step, i.e. down column l of the stored b.
+			f64.AxpyInc(scaled, b[l:], c[i*ldc:i*ldc+n], uintptr(n), uintptr(ldb), 1, 0, 0)
+		}
+	}
+}
+
+// sliceView64 returns the sub-slice of a that spans the r×c block whose top
+// left element is at row i, column j of a row-major matrix with stride lda.
+// The view runs from the block's first element to one past its last element.
+func sliceView64(a []float64, lda, i, j, r, c int) []float64 {
+	first := i*lda + j
+	end := (i+r-1)*lda + j + c
+	return a[first:end]
+}
--- a/vendor/gonum.org/v1/gonum/blas/gonum/doc.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/doc.go
@@ -0,0 +1,99 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Ensure changes made to blas/native are reflected in blas/cgo where relevant.
+
+/*
+Package gonum is a Go implementation of the BLAS API. This implementation
+panics when the input arguments are invalid as per the standard, for example
+if a vector increment is zero. Note that the treatment of NaN values
+is not specified, and differs among the BLAS implementations.
+gonum.org/v1/gonum/blas/blas64 provides helpful wrapper functions to the BLAS
+interface. The rest of this text describes the layout of the data for the input types.
+
+Note that in the function documentation, x[i] refers to the i^th element
+of the vector, which will be different from the i^th element of the slice if
+incX != 1.
+
+See http://www.netlib.org/lapack/explore-html/d4/de1/_l_i_c_e_n_s_e_source.html
+for more license information.
+
+Vector arguments are effectively strided slices. They have two input arguments,
+a number of elements, n, and an increment, incX. The increment specifies the
+distance between elements of the vector. The actual Go slice may be longer
+than necessary.
+The increment may be positive or negative, except in functions with only
+a single vector argument where the increment may only be positive. If the increment
+is negative, s[0] is the last element in the slice. Note that this is not the same
+as counting backward from the end of the slice, as len(s) may be longer than
+necessary. So, for example, if n = 5 and incX = 3, the elements of s are
+
+	[0 * * 1 * * 2 * * 3 * * 4 * * * ...]
+
+where ∗ elements are never accessed. If incX = -3, the same elements are
+accessed, just in reverse order (4, 3, 2, 1, 0).
+
+Dense matrices are specified by a number of rows, a number of columns, and a stride.
+The stride specifies the number of entries in the slice between the first element
+of successive rows. The stride must be at least as large as the number of columns
+but may be longer.
+
+	[a00 ... a0n a0* ... a0stride-1 a10 ... amn am* ... amstride-1]
+
+Thus, dense[i*ld + j] refers to the {i, j}th element of the matrix.
+
+Symmetric and triangular matrices (non-packed) are stored identically to Dense,
+except that only elements in one triangle of the matrix are accessed.
+
+Packed symmetric and packed triangular matrices are laid out with the entries
+condensed such that all of the unreferenced elements are removed. So, the upper triangular
+matrix
+
+	[
+	  1  2  3
+	  0  4  5
+	  0  0  6
+	]
+
+and the lower-triangular matrix
+
+	[
+	  1  0  0
+	  2  3  0
+	  4  5  6
+	]
+
+will both be compacted as [1 2 3 4 5 6]. The (i, j) element of the original
+dense matrix can be found at element i*n - i*(i+1)/2 + j for upper triangular,
+and at element i*(i+1)/2 + j for lower triangular.
+
+Banded matrices are laid out in a compact format, constructed by removing the
+zeros in the rows and aligning the diagonals. For example, the matrix
+
+	[
+	  1  2  3  0  0  0
+	  4  5  6  7  0  0
+	  0  8  9 10 11  0
+	  0  0 12 13 14 15
+	  0  0  0 16 17 18
+	  0  0  0  0 19 20
+	]
+
+implicitly becomes (∗ entries are never accessed)
+
+	[
+	   *  1  2  3
+	   4  5  6  7
+	   8  9 10 11
+	  12 13 14 15
+	  16 17 18  *
+	  19 20  *  *
+	]
+
+which is given to the BLAS routine as [∗ 1 2 3 4 ...].
+
+See http://www.crest.iu.edu/research/mtl/reference/html/banded.html
+for more information
+*/
+package gonum // import "gonum.org/v1/gonum/blas/gonum"
--- a/vendor/gonum.org/v1/gonum/blas/gonum/errors.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/errors.go
@@ -0,0 +1,35 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+// Panic strings used during parameter checks.
+// This list is duplicated in netlib/blas/netlib. Keep in sync.
+const (
+	// Vector increments that are zero.
+	zeroIncX = "blas: zero x index increment"
+	zeroIncY = "blas: zero y index increment"
+
+	// Negative dimensions or bandwidths.
+	mLT0  = "blas: m < 0"
+	nLT0  = "blas: n < 0"
+	kLT0  = "blas: k < 0"
+	kLLT0 = "blas: kL < 0"
+	kULT0 = "blas: kU < 0"
+
+	// Option arguments outside their allowed set of values.
+	badUplo      = "blas: illegal triangle"
+	badTranspose = "blas: illegal transpose"
+	badDiag      = "blas: illegal diagonal"
+	badSide      = "blas: illegal side"
+	badFlag      = "blas: illegal rotm flag"
+
+	// Leading dimensions too small for the matrix they describe.
+	badLdA = "blas: bad leading dimension of A"
+	badLdB = "blas: bad leading dimension of B"
+	badLdC = "blas: bad leading dimension of C"
+
+	// Slices too short to hold the vector or matrix they describe.
+	shortX  = "blas: insufficient length of x"
+	shortY  = "blas: insufficient length of y"
+	shortAP = "blas: insufficient length of ap"
+	shortA  = "blas: insufficient length of a"
+	shortB  = "blas: insufficient length of b"
+	shortC  = "blas: insufficient length of c"
+)
--- a/vendor/gonum.org/v1/gonum/blas/gonum/gonum.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/gonum.go
@@ -0,0 +1,52 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:generate ./single_precision.bash
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/internal/math32"
+)
+
+// Implementation is the empty struct on which this package's BLAS routines
+// are defined as methods.
+type Implementation struct{}
+
+// [SD]gemm behavior constants. These are kept here to keep them out of the
+// way during single precision code generation.
+const (
+	blockSize   = 64 // side length b of the b x b sub-blocks used by the blocked gemm
+	minParBlock = 4  // minimum number of blocks needed to go parallel
+)
+
+// max returns the larger of a and b.
+func max(a, b int) int {
+	if b >= a {
+		return b
+	}
+	return a
+}
+
+// min returns the smaller of a and b.
+func min(a, b int) int {
+	if b < a {
+		return b
+	}
+	return a
+}
+
+// blocks reports how many blocks of size bsize are needed to cover a
+// dimension of length dim, counting a final partial block as one block.
+func blocks(dim, bsize int) int {
+	// Adding bsize-1 before dividing rounds the quotient up.
+	padded := dim + bsize - 1
+	return padded / bsize
+}
+
+// dcabs1 returns the sum of the absolute values of the real and imaginary
+// parts of z, |real(z)|+|imag(z)|.
+func dcabs1(z complex128) float64 {
+	re := math.Abs(real(z))
+	im := math.Abs(imag(z))
+	return re + im
+}
+
+// scabs1 returns the sum of the absolute values of the real and imaginary
+// parts of z, |real(z)|+|imag(z)|.
+func scabs1(z complex64) float32 {
+	re := math32.Abs(real(z))
+	im := math32.Abs(imag(z))
+	return re + im
+}
--- a/vendor/gonum.org/v1/gonum/blas/gonum/level1cmplx128.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level1cmplx128.go
@@ -0,0 +1,454 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/internal/asm/c128"
+)
+
+var _ blas.Complex128Level1 = Implementation{}
+
+// Dzasum returns the sum of the absolute values of the real and imaginary
+// parts of the elements of x
+//
+//	\sum_i |Re(x[i])| + |Im(x[i])|
+//
+// Dzasum returns 0 if incX is negative. It panics if n is negative, if incX
+// is zero, or if x is too short for n elements at increment incX.
+func (Implementation) Dzasum(n int, x []complex128, incX int) float64 {
+	switch {
+	case n < 0:
+		panic(nLT0)
+	case incX == 0:
+		panic(zeroIncX)
+	case incX < 0:
+		return 0
+	}
+	var total float64
+	if incX == 1 {
+		if n > len(x) {
+			panic(shortX)
+		}
+		for i := 0; i < n; i++ {
+			total += dcabs1(x[i])
+		}
+		return total
+	}
+	// The last element read is at index (n-1)*incX.
+	if len(x) <= (n-1)*incX {
+		panic(shortX)
+	}
+	for i, ix := 0, 0; i < n; i, ix = i+1, ix+incX {
+		total += dcabs1(x[ix])
+	}
+	return total
+}
+
+// Dznrm2 computes the Euclidean norm of the complex vector x,
+//
+//	‖x‖_2 = sqrt(\sum_i x[i] * conj(x[i])).
+//
+// This function returns 0 if incX is negative or if n is zero. It panics if
+// incX is zero, if n is negative, or if x is too short for n elements at
+// increment incX.
+func (Implementation) Dznrm2(n int, x []complex128, incX int) float64 {
+	if incX < 1 {
+		if incX == 0 {
+			panic(zeroIncX)
+		}
+		return 0
+	}
+	if n < 1 {
+		if n == 0 {
+			return 0
+		}
+		panic(nLT0)
+	}
+	if (n-1)*incX >= len(x) {
+		panic(shortX)
+	}
+	// The norm is accumulated as scale*sqrt(ssq): scale holds the largest
+	// absolute component seen so far, and ssq holds the sum of squares of
+	// the components divided by scale. Squaring only ratios that are at most
+	// one avoids squaring the raw components directly.
+	var (
+		scale float64
+		ssq   float64 = 1
+	)
+	if incX == 1 {
+		for _, v := range x[:n] {
+			re, im := math.Abs(real(v)), math.Abs(imag(v))
+			if re != 0 {
+				if re > scale {
+					// New largest component: re-express ssq relative to it.
+					ssq = 1 + ssq*(scale/re)*(scale/re)
+					scale = re
+				} else {
+					ssq += (re / scale) * (re / scale)
+				}
+			}
+			if im != 0 {
+				if im > scale {
+					ssq = 1 + ssq*(scale/im)*(scale/im)
+					scale = im
+				} else {
+					ssq += (im / scale) * (im / scale)
+				}
+			}
+		}
+		// Return +Inf directly when a component was infinite instead of
+		// evaluating scale*sqrt(ssq).
+		if math.IsInf(scale, 1) {
+			return math.Inf(1)
+		}
+		return scale * math.Sqrt(ssq)
+	}
+	// Same accumulation as above, visiting every incX-th element of x.
+	for ix := 0; ix < n*incX; ix += incX {
+		re, im := math.Abs(real(x[ix])), math.Abs(imag(x[ix]))
+		if re != 0 {
+			if re > scale {
+				ssq = 1 + ssq*(scale/re)*(scale/re)
+				scale = re
+			} else {
+				ssq += (re / scale) * (re / scale)
+			}
+		}
+		if im != 0 {
+			if im > scale {
+				ssq = 1 + ssq*(scale/im)*(scale/im)
+				scale = im
+			} else {
+				ssq += (im / scale) * (im / scale)
+			}
+		}
+	}
+	if math.IsInf(scale, 1) {
+		return math.Inf(1)
+	}
+	return scale * math.Sqrt(ssq)
+}
+
+// Izamax returns the index of the first element of x having largest |Re(·)|+|Im(·)|.
+// Izamax returns -1 if n is 0 or incX is negative. It panics if incX is
+// zero, if n is negative, or if x is too short for n elements at increment
+// incX.
+func (Implementation) Izamax(n int, x []complex128, incX int) int {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incX < 0:
+		// Return invalid index.
+		return -1
+	case n == 0:
+		// Return invalid index.
+		return -1
+	case n < 0:
+		panic(nLT0)
+	case (n-1)*incX >= len(x):
+		panic(shortX)
+	}
+	best, bestAt := dcabs1(x[0]), 0
+	// The strict comparison keeps the earliest index among equal magnitudes.
+	for i, ix := 1, incX; i < n; i, ix = i+1, ix+incX {
+		if mag := dcabs1(x[ix]); mag > best {
+			best, bestAt = mag, i
+		}
+	}
+	return bestAt
+}
+
+// Zaxpy adds alpha times x to y:
+//
+//	y[i] += alpha * x[i] for all i
+//
+// Zaxpy panics if incX or incY is zero, if n is negative, or if x or y is
+// too short for n elements at its increment. It does nothing if n is zero
+// or alpha is zero.
+func (Implementation) Zaxpy(n int, alpha complex128, x []complex128, incX int, y []complex128, incY int) {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incY == 0:
+		panic(zeroIncY)
+	case n == 0:
+		return
+	case n < 0:
+		panic(nLT0)
+	}
+
+	// For a negative increment the traversal starts at the far end of the
+	// slice, and that start index is also the largest index touched.
+	var ix, iy int
+	lastX, lastY := (n-1)*incX, (n-1)*incY
+	if incX < 0 {
+		ix = (1 - n) * incX
+		lastX = ix
+	}
+	if incY < 0 {
+		iy = (1 - n) * incY
+		lastY = iy
+	}
+	if lastX >= len(x) {
+		panic(shortX)
+	}
+	if lastY >= len(y) {
+		panic(shortY)
+	}
+
+	if alpha == 0 {
+		return
+	}
+	if incX == 1 && incY == 1 {
+		c128.AxpyUnitary(alpha, x[:n], y[:n])
+		return
+	}
+	c128.AxpyInc(alpha, x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy))
+}
+
+// Zcopy copies the vector x to vector y.
+//
+// Zcopy panics if incX or incY is zero, if n is negative, or if x or y is
+// too short for n elements at its increment. It does nothing if n is zero.
+func (Implementation) Zcopy(n int, x []complex128, incX int, y []complex128, incY int) {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incY == 0:
+		panic(zeroIncY)
+	case n == 0:
+		return
+	case n < 0:
+		panic(nLT0)
+	}
+
+	// For a negative increment the traversal starts at the far end of the
+	// slice, and that start index is also the largest index touched.
+	var ix, iy int
+	lastX, lastY := (n-1)*incX, (n-1)*incY
+	if incX < 0 {
+		ix = (-n + 1) * incX
+		lastX = (1 - n) * incX
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+		lastY = (1 - n) * incY
+	}
+	if lastX >= len(x) {
+		panic(shortX)
+	}
+	if lastY >= len(y) {
+		panic(shortY)
+	}
+
+	if incX == 1 && incY == 1 {
+		copy(y[:n], x[:n])
+		return
+	}
+	for remaining := n; remaining > 0; remaining-- {
+		y[iy] = x[ix]
+		ix += incX
+		iy += incY
+	}
+}
+
+// Zdotc computes the dot product
+//
+//	xᴴ · y
+//
+// of two complex vectors x and y.
+//
+// Zdotc returns 0 if n is zero. It panics if incX or incY is zero, if n is
+// negative, or if x or y is too short for n elements at its increment.
+func (Implementation) Zdotc(n int, x []complex128, incX int, y []complex128, incY int) complex128 {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incY == 0:
+		panic(zeroIncY)
+	case n == 0:
+		return 0
+	case n < 0:
+		panic(nLT0)
+	}
+	if incX == 1 && incY == 1 {
+		switch {
+		case len(x) < n:
+			panic(shortX)
+		case len(y) < n:
+			panic(shortY)
+		}
+		return c128.DotcUnitary(x[:n], y[:n])
+	}
+	// For a negative increment the traversal starts at the far end of the
+	// slice.
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	// Either the start index (negative increment) or the end index
+	// (positive increment) is the largest index touched; both must be in
+	// range.
+	if len(x) <= ix || len(x) <= (n-1)*incX {
+		panic(shortX)
+	}
+	if len(y) <= iy || len(y) <= (n-1)*incY {
+		panic(shortY)
+	}
+	return c128.DotcInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy))
+}
+
+// Zdotu computes the dot product
+//
+//	xᵀ · y
+//
+// of two complex vectors x and y.
+//
+// Zdotu returns 0 if n is zero. It panics if incX or incY is zero, if n is
+// negative, or if x or y is too short for n elements at its increment.
+func (Implementation) Zdotu(n int, x []complex128, incX int, y []complex128, incY int) complex128 {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incY == 0:
+		panic(zeroIncY)
+	case n == 0:
+		return 0
+	case n < 0:
+		panic(nLT0)
+	}
+	if incX == 1 && incY == 1 {
+		switch {
+		case len(x) < n:
+			panic(shortX)
+		case len(y) < n:
+			panic(shortY)
+		}
+		return c128.DotuUnitary(x[:n], y[:n])
+	}
+	// For a negative increment the traversal starts at the far end of the
+	// slice.
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	// Either the start index (negative increment) or the end index
+	// (positive increment) is the largest index touched; both must be in
+	// range.
+	if len(x) <= ix || len(x) <= (n-1)*incX {
+		panic(shortX)
+	}
+	if len(y) <= iy || len(y) <= (n-1)*incY {
+		panic(shortY)
+	}
+	return c128.DotuInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy))
+}
+
+// Zdscal scales the vector x by a real scalar alpha.
+// Zdscal has no effect if incX < 0.
+//
+// Zdscal panics if incX is zero, if x is too short for n elements at
+// increment incX, or if n is negative. It does nothing if n is zero.
+func (Implementation) Zdscal(n int, alpha float64, x []complex128, incX int) {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incX < 0:
+		return
+	case (n-1)*incX >= len(x):
+		panic(shortX)
+	case n == 0:
+		return
+	case n < 0:
+		panic(nLT0)
+	}
+	if alpha == 0 {
+		// Store exact zeros rather than multiplying, so existing NaN or
+		// Inf entries are overwritten.
+		for i, ix := 0, 0; i < n; i, ix = i+1, ix+incX {
+			x[ix] = 0
+		}
+		return
+	}
+	for i, ix := 0, 0; i < n; i, ix = i+1, ix+incX {
+		elem := x[ix]
+		x[ix] = complex(alpha*real(elem), alpha*imag(elem))
+	}
+}
+
+// Zscal scales the vector x by a complex scalar alpha.
+// Zscal has no effect if incX < 0.
+//
+// Zscal panics if incX is zero, if x is too short for n elements at
+// increment incX, or if n is negative. It does nothing if n is zero.
+func (Implementation) Zscal(n int, alpha complex128, x []complex128, incX int) {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incX < 0:
+		return
+	case (n-1)*incX >= len(x):
+		panic(shortX)
+	case n == 0:
+		return
+	case n < 0:
+		panic(nLT0)
+	}
+	switch {
+	case alpha == 0:
+		// Store exact zeros rather than multiplying, so existing NaN or
+		// Inf entries are overwritten.
+		for i, ix := 0, 0; i < n; i, ix = i+1, ix+incX {
+			x[ix] = 0
+		}
+	case incX == 1:
+		c128.ScalUnitary(alpha, x[:n])
+	default:
+		c128.ScalInc(alpha, x, uintptr(n), uintptr(incX))
+	}
+}
+
+// Zswap exchanges the elements of two complex vectors x and y.
+//
+// Zswap panics if incX or incY is zero, if n is negative, or if x or y is
+// too short for n elements at its increment. It does nothing if n is zero.
+func (Implementation) Zswap(n int, x []complex128, incX int, y []complex128, incY int) {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incY == 0:
+		panic(zeroIncY)
+	case n == 0:
+		return
+	case n < 0:
+		panic(nLT0)
+	}
+
+	// For a negative increment the traversal starts at the far end of the
+	// slice, and that start index is also the largest index touched.
+	var ix, iy int
+	lastX, lastY := (n-1)*incX, (n-1)*incY
+	if incX < 0 {
+		ix = (-n + 1) * incX
+		lastX = (1 - n) * incX
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+		lastY = (1 - n) * incY
+	}
+	if lastX >= len(x) {
+		panic(shortX)
+	}
+	if lastY >= len(y) {
+		panic(shortY)
+	}
+
+	if incX == 1 && incY == 1 {
+		xs, ys := x[:n], y[:n]
+		for i := range xs {
+			xs[i], ys[i] = ys[i], xs[i]
+		}
+		return
+	}
+	for remaining := n; remaining > 0; remaining-- {
+		x[ix], y[iy] = y[iy], x[ix]
+		ix += incX
+		iy += incY
+	}
+}
--- a/vendor/gonum.org/v1/gonum/blas/gonum/level1cmplx64.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level1cmplx64.go
@@ -0,0 +1,476 @@
+// Code generated by "go generate gonum.org/v1/gonum/blas/gonum"; DO NOT EDIT.
+
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	math "gonum.org/v1/gonum/internal/math32"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/internal/asm/c64"
+)
+
+// Compile-time assertion that Implementation satisfies the
+// blas.Complex64Level1 interface.
+var _ blas.Complex64Level1 = Implementation{}
+
+// Scasum computes the sum of the magnitudes of the real and imaginary parts
+// of the n elements of x selected by incX:
+//
+//	\sum_i |Re(x[i])| + |Im(x[i])|
+//
+// A negative incX yields 0.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Scasum(n int, x []complex64, incX int) float32 {
+	switch {
+	case n < 0:
+		panic(nLT0)
+	case incX == 0:
+		panic(zeroIncX)
+	case incX < 0:
+		return 0
+	}
+	var total float32
+	if incX != 1 {
+		if len(x) <= (n-1)*incX {
+			panic(shortX)
+		}
+		for k := 0; k < n; k++ {
+			total += scabs1(x[k*incX])
+		}
+		return total
+	}
+	// Unit stride: sum over the leading n elements.
+	if n > len(x) {
+		panic(shortX)
+	}
+	for _, z := range x[:n] {
+		total += scabs1(z)
+	}
+	return total
+}
+
+// Scnrm2 computes the Euclidean norm of the complex vector x,
+//
+//	‖x‖_2 = sqrt(\sum_i x[i] * conj(x[i])).
+//
+// Only the n elements of x selected by the stride incX take part.
+// Scnrm2 returns 0 if n is 0 or incX is negative. It panics if incX is 0,
+// if n is negative, or if x is too short for n and incX.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Scnrm2(n int, x []complex64, incX int) float32 {
+	if incX < 1 {
+		if incX == 0 {
+			panic(zeroIncX)
+		}
+		return 0
+	}
+	if n < 1 {
+		if n == 0 {
+			return 0
+		}
+		panic(nLT0)
+	}
+	if (n-1)*incX >= len(x) {
+		panic(shortX)
+	}
+	// The sum of squares is carried in scaled form: at any point the sum of
+	// the squared components processed so far equals scale*scale*ssq, where
+	// scale is the largest component magnitude seen so far.
+	var (
+		scale float32
+		ssq   float32 = 1
+	)
+	if incX == 1 {
+		for _, v := range x[:n] {
+			re, im := math.Abs(real(v)), math.Abs(imag(v))
+			// Zero components are skipped; they contribute nothing.
+			if re != 0 {
+				if re > scale {
+					// New maximum: re-express the running sum relative to it.
+					ssq = 1 + ssq*(scale/re)*(scale/re)
+					scale = re
+				} else {
+					ssq += (re / scale) * (re / scale)
+				}
+			}
+			if im != 0 {
+				if im > scale {
+					ssq = 1 + ssq*(scale/im)*(scale/im)
+					scale = im
+				} else {
+					ssq += (im / scale) * (im / scale)
+				}
+			}
+		}
+		// An infinite component makes the norm +Inf regardless of ssq.
+		if math.IsInf(scale, 1) {
+			return math.Inf(1)
+		}
+		return scale * math.Sqrt(ssq)
+	}
+	// Strided case: the same accumulation over x[0], x[incX], x[2*incX], ...
+	for ix := 0; ix < n*incX; ix += incX {
+		re, im := math.Abs(real(x[ix])), math.Abs(imag(x[ix]))
+		if re != 0 {
+			if re > scale {
+				ssq = 1 + ssq*(scale/re)*(scale/re)
+				scale = re
+			} else {
+				ssq += (re / scale) * (re / scale)
+			}
+		}
+		if im != 0 {
+			if im > scale {
+				ssq = 1 + ssq*(scale/im)*(scale/im)
+				scale = im
+			} else {
+				ssq += (im / scale) * (im / scale)
+			}
+		}
+	}
+	if math.IsInf(scale, 1) {
+		return math.Inf(1)
+	}
+	return scale * math.Sqrt(ssq)
+}
+
+// Icamax finds the first of the n elements of x, taken with stride incX,
+// whose |Re(·)|+|Im(·)| is largest and returns its position in that sequence.
+// The result is -1 when n is 0 or incX is negative.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Icamax(n int, x []complex64, incX int) int {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incX < 0:
+		// Report an invalid index.
+		return -1
+	case n < 0:
+		panic(nLT0)
+	case n == 0:
+		// Report an invalid index.
+		return -1
+	case len(x) <= (n-1)*incX:
+		panic(shortX)
+	}
+	best, bestAbs := 0, scabs1(x[0])
+	if incX == 1 {
+		for i, v := range x[1:n] {
+			if a := scabs1(v); a > bestAbs {
+				best, bestAbs = i+1, a
+			}
+		}
+		return best
+	}
+	// The strict comparison keeps the earliest index among equal magnitudes.
+	for i, k := 1, incX; i < n; i, k = i+1, k+incX {
+		if a := scabs1(x[k]); a > bestAbs {
+			best, bestAbs = i, a
+		}
+	}
+	return best
+}
+
+// Caxpy updates y in place by adding alpha times x to it:
+//
+//	y[i] += alpha * x[i] for all i
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Caxpy(n int, alpha complex64, x []complex64, incX int, y []complex64, incY int) {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incY == 0:
+		panic(zeroIncY)
+	case n < 0:
+		panic(nLT0)
+	case n == 0:
+		return
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+	// The arguments are validated before this shortcut, so invalid input
+	// panics even when there is nothing to add.
+	if alpha == 0 {
+		return
+	}
+	if incX == 1 && incY == 1 {
+		c64.AxpyUnitary(alpha, x[:n], y[:n])
+		return
+	}
+	// A negative stride starts at the far end of its slice.
+	var startX, startY int
+	if incX < 0 {
+		startX = (1 - n) * incX
+	}
+	if incY < 0 {
+		startY = (1 - n) * incY
+	}
+	c64.AxpyInc(alpha, x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(startX), uintptr(startY))
+}
+
+// Ccopy stores the n elements of x selected by incX into the n positions of
+// y selected by incY.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Ccopy(n int, x []complex64, incX int, y []complex64, incY int) {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incY == 0:
+		panic(zeroIncY)
+	case n < 0:
+		panic(nLT0)
+	case n == 0:
+		return
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+	if incX == 1 && incY == 1 {
+		copy(y[:n], x[:n])
+		return
+	}
+	// A negative stride starts at the far end of its slice.
+	var kx, ky int
+	if incX < 0 {
+		kx = (1 - n) * incX
+	}
+	if incY < 0 {
+		ky = (1 - n) * incY
+	}
+	for left := n; left > 0; left-- {
+		y[ky] = x[kx]
+		kx += incX
+		ky += incY
+	}
+}
+
+// Cdotc returns the dot product
+//
+//	xᴴ · y
+//
+// of the n elements of the complex vectors x and y selected by incX and incY.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Cdotc(n int, x []complex64, incX int, y []complex64, incY int) complex64 {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incY == 0:
+		panic(zeroIncY)
+	case n < 0:
+		panic(nLT0)
+	case n == 0:
+		return 0
+	}
+	if incX == 1 && incY == 1 {
+		// Contiguous operands only need n elements each.
+		if n > len(x) {
+			panic(shortX)
+		}
+		if n > len(y) {
+			panic(shortY)
+		}
+		return c64.DotcUnitary(x[:n], y[:n])
+	}
+	// A negative stride starts at the far end of its slice.
+	var startX, startY int
+	if incX < 0 {
+		startX = (1 - n) * incX
+	}
+	if incY < 0 {
+		startY = (1 - n) * incY
+	}
+	if len(x) <= startX || len(x) <= (n-1)*incX {
+		panic(shortX)
+	}
+	if len(y) <= startY || len(y) <= (n-1)*incY {
+		panic(shortY)
+	}
+	return c64.DotcInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(startX), uintptr(startY))
+}
+
+// Cdotu returns the dot product
+//
+//	xᵀ · y
+//
+// of the n elements of the complex vectors x and y selected by incX and incY.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Cdotu(n int, x []complex64, incX int, y []complex64, incY int) complex64 {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incY == 0:
+		panic(zeroIncY)
+	case n < 0:
+		panic(nLT0)
+	case n == 0:
+		return 0
+	}
+	if incX == 1 && incY == 1 {
+		// Contiguous operands only need n elements each.
+		if n > len(x) {
+			panic(shortX)
+		}
+		if n > len(y) {
+			panic(shortY)
+		}
+		return c64.DotuUnitary(x[:n], y[:n])
+	}
+	// A negative stride starts at the far end of its slice.
+	var startX, startY int
+	if incX < 0 {
+		startX = (1 - n) * incX
+	}
+	if incY < 0 {
+		startY = (1 - n) * incY
+	}
+	if len(x) <= startX || len(x) <= (n-1)*incX {
+		panic(shortX)
+	}
+	if len(y) <= startY || len(y) <= (n-1)*incY {
+		panic(shortY)
+	}
+	return c64.DotuInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(startX), uintptr(startY))
+}
+
+// Csscal multiplies each of the n elements of x, taken with stride incX, by
+// the real scalar alpha. A negative incX leaves x unmodified.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Csscal(n int, alpha float32, x []complex64, incX int) {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incX < 0:
+		return
+	case len(x) <= (n-1)*incX:
+		panic(shortX)
+	case n < 0:
+		panic(nLT0)
+	case n == 0:
+		return
+	}
+	unit := incX == 1
+	if alpha == 0 {
+		// Store exact zeros instead of multiplying each component.
+		if unit {
+			head := x[:n]
+			for k := range head {
+				head[k] = 0
+			}
+			return
+		}
+		for k, end := 0, n*incX; k < end; k += incX {
+			x[k] = 0
+		}
+		return
+	}
+	if unit {
+		head := x[:n]
+		for k, z := range head {
+			head[k] = complex(alpha*real(z), alpha*imag(z))
+		}
+		return
+	}
+	// Real and imaginary parts are scaled separately.
+	for k, end := 0, n*incX; k < end; k += incX {
+		z := x[k]
+		x[k] = complex(alpha*real(z), alpha*imag(z))
+	}
+}
+
+// Cscal multiplies each of the n elements of x, taken with stride incX, by
+// the complex scalar alpha. A negative incX leaves x unmodified.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Cscal(n int, alpha complex64, x []complex64, incX int) {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incX < 0:
+		return
+	case len(x) <= (n-1)*incX:
+		panic(shortX)
+	case n < 0:
+		panic(nLT0)
+	case n == 0:
+		return
+	}
+	unit := incX == 1
+	if alpha == 0 {
+		// Store exact zeros instead of calling the scaling kernels.
+		if unit {
+			head := x[:n]
+			for k := range head {
+				head[k] = 0
+			}
+			return
+		}
+		for k, end := 0, n*incX; k < end; k += incX {
+			x[k] = 0
+		}
+		return
+	}
+	if !unit {
+		c64.ScalInc(alpha, x, uintptr(n), uintptr(incX))
+		return
+	}
+	// Unit stride: hand the leading n elements to the contiguous kernel.
+	c64.ScalUnitary(alpha, x[:n])
+}
+
+// Cswap interchanges the n elements of x selected by incX with the n
+// elements of y selected by incY.
+//
+// Complex64 implementations are autogenerated and not directly tested.
+func (Implementation) Cswap(n int, x []complex64, incX int, y []complex64, incY int) {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incY == 0:
+		panic(zeroIncY)
+	case n < 0:
+		panic(nLT0)
+	case n == 0:
+		return
+	}
+	// The farthest element touched in either slice is (n-1)*|inc|.
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+	if incX == 1 && incY == 1 {
+		for i := range x[:n] {
+			x[i], y[i] = y[i], x[i]
+		}
+		return
+	}
+	// A negative stride walks its slice backwards from the far end.
+	var kx, ky int
+	if incX < 0 {
+		kx = (1 - n) * incX
+	}
+	if incY < 0 {
+		ky = (1 - n) * incY
+	}
+	for left := n; left > 0; left-- {
+		x[kx], y[ky] = y[ky], x[kx]
+		kx += incX
+		ky += incY
+	}
+}
--- a/vendor/gonum.org/v1/gonum/blas/gonum/level1float32.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level1float32.go
@@ -0,0 +1,653 @@
+// Code generated by "go generate gonum.org/v1/gonum/blas/gonum"; DO NOT EDIT.
+
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	math "gonum.org/v1/gonum/internal/math32"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/internal/asm/f32"
+)
+
+// Compile-time assertion that Implementation satisfies the
+// blas.Float32Level1 interface.
+var _ blas.Float32Level1 = Implementation{}
+
+// Snrm2 returns the Euclidean norm of the n elements of x selected by incX,
+//
+//	sqrt(\sum_i x[i] * x[i]).
+//
+// A negative incX yields 0.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Snrm2(n int, x []float32, incX int) float32 {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incX < 0:
+		return 0
+	case (n-1)*incX >= len(x):
+		panic(shortX)
+	case n < 0:
+		panic(nLT0)
+	case n == 0:
+		return 0
+	case n == 1:
+		// The norm of a single element is its absolute value.
+		return math.Abs(x[0])
+	case incX == 1:
+		return f32.L2NormUnitary(x[:n])
+	}
+	return f32.L2NormInc(x, uintptr(n), uintptr(incX))
+}
+
+// Sasum returns the sum of the absolute values of the n elements of x
+// selected by incX.
+//
+//	\sum_i |x[i]|
+//
+// A negative incX yields 0.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Sasum(n int, x []float32, incX int) float32 {
+	switch {
+	case n < 0:
+		panic(nLT0)
+	case incX == 0:
+		panic(zeroIncX)
+	case incX < 0:
+		return 0
+	case (n-1)*incX >= len(x):
+		panic(shortX)
+	}
+	var total float32
+	if incX != 1 {
+		for k := 0; k < n; k++ {
+			total += math.Abs(x[k*incX])
+		}
+		return total
+	}
+	// Unit stride: sum over the leading n elements.
+	for _, v := range x[:n] {
+		total += math.Abs(v)
+	}
+	return total
+}
+
+// Isamax finds, among the n elements of x taken with stride incX, the one
+// with the largest absolute value and returns its position in that sequence.
+// Ties are resolved in favour of the earliest position.
+// The result is -1 when n is 0 or incX is negative.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Isamax(n int, x []float32, incX int) int {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incX < 0:
+		return -1
+	case (n-1)*incX >= len(x):
+		panic(shortX)
+	case n < 0:
+		panic(nLT0)
+	case n == 0:
+		return -1 // Netlib returns invalid index when n == 0.
+	case n == 1:
+		return 0
+	}
+	// x[0] is the initial candidate, so the scans start at the second element.
+	best, bestAbs := 0, math.Abs(x[0])
+	if incX == 1 {
+		for i, v := range x[1:n] {
+			if a := math.Abs(v); a > bestAbs {
+				best, bestAbs = i+1, a
+			}
+		}
+		return best
+	}
+	for i, k := 1, incX; i < n; i, k = i+1, k+incX {
+		if a := math.Abs(x[k]); a > bestAbs {
+			best, bestAbs = i, a
+		}
+	}
+	return best
+}
+
+// Sswap interchanges the n elements of x selected by incX with the n
+// elements of y selected by incY.
+//
+//	x[i], y[i] = y[i], x[i] for all i
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Sswap(n int, x []float32, incX int, y []float32, incY int) {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incY == 0:
+		panic(zeroIncY)
+	case n < 0:
+		panic(nLT0)
+	case n == 0:
+		return
+	}
+	// The farthest element touched in either slice is (n-1)*|inc|.
+	if (incX > 0 && (n-1)*incX >= len(x)) || (incX < 0 && (1-n)*incX >= len(x)) {
+		panic(shortX)
+	}
+	if (incY > 0 && (n-1)*incY >= len(y)) || (incY < 0 && (1-n)*incY >= len(y)) {
+		panic(shortY)
+	}
+	if incX == 1 && incY == 1 {
+		for i := range x[:n] {
+			x[i], y[i] = y[i], x[i]
+		}
+		return
+	}
+	// A negative stride walks its slice backwards from the far end.
+	var kx, ky int
+	if incX < 0 {
+		kx = (1 - n) * incX
+	}
+	if incY < 0 {
+		ky = (1 - n) * incY
+	}
+	for left := n; left > 0; left-- {
+		x[kx], y[ky] = y[ky], x[kx]
+		kx += incX
+		ky += incY
+	}
+}
+
+// Scopy stores the n elements of x selected by incX into the n positions of
+// y selected by incY.
+//
+//	y[i] = x[i] for all i
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Scopy(n int, x []float32, incX int, y []float32, incY int) {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incY == 0:
+		panic(zeroIncY)
+	case n < 0:
+		panic(nLT0)
+	case n == 0:
+		return
+	}
+	if (incX > 0 && (n-1)*incX >= len(x)) || (incX < 0 && (1-n)*incX >= len(x)) {
+		panic(shortX)
+	}
+	if (incY > 0 && (n-1)*incY >= len(y)) || (incY < 0 && (1-n)*incY >= len(y)) {
+		panic(shortY)
+	}
+	if incX == 1 && incY == 1 {
+		copy(y[:n], x[:n])
+		return
+	}
+	// A negative stride starts at the far end of its slice.
+	var kx, ky int
+	if incX < 0 {
+		kx = (1 - n) * incX
+	}
+	if incY < 0 {
+		ky = (1 - n) * incY
+	}
+	for left := n; left > 0; left-- {
+		y[ky] = x[kx]
+		kx += incX
+		ky += incY
+	}
+}
+
+// Saxpy updates y in place by adding alpha times x to it
+//
+//	y[i] += alpha * x[i] for all i
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Saxpy(n int, alpha float32, x []float32, incX int, y []float32, incY int) {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incY == 0:
+		panic(zeroIncY)
+	case n < 0:
+		panic(nLT0)
+	case n == 0:
+		return
+	}
+	if (incX > 0 && (n-1)*incX >= len(x)) || (incX < 0 && (1-n)*incX >= len(x)) {
+		panic(shortX)
+	}
+	if (incY > 0 && (n-1)*incY >= len(y)) || (incY < 0 && (1-n)*incY >= len(y)) {
+		panic(shortY)
+	}
+	// The arguments are validated before this shortcut, so invalid input
+	// panics even when there is nothing to add.
+	if alpha == 0 {
+		return
+	}
+	if incX == 1 && incY == 1 {
+		f32.AxpyUnitary(alpha, x[:n], y[:n])
+		return
+	}
+	// A negative stride starts at the far end of its slice.
+	var startX, startY int
+	if incX < 0 {
+		startX = (1 - n) * incX
+	}
+	if incY < 0 {
+		startY = (1 - n) * incY
+	}
+	f32.AxpyInc(alpha, x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(startX), uintptr(startY))
+}
+
+// Srotg computes a plane rotation
+//
+//	⎡  c s ⎤ ⎡ a ⎤ = ⎡ r ⎤
+//	⎣ -s c ⎦ ⎣ b ⎦   ⎣ 0 ⎦
+//
+// satisfying c^2 + s^2 = 1.
+//
+// The computation uses the formulas
+//
+//	sigma = sgn(a)    if |a| >  |b|
+//	      = sgn(b)    if |b| >= |a|
+//	r = sigma*sqrt(a^2 + b^2)
+//	c = 1; s = 0      if r = 0
+//	c = a/r; s = b/r  if r != 0
+//	c >= 0            if |a| > |b|
+//
+// The subroutine also computes
+//
+//	z = s    if |a| > |b|,
+//	  = 1/c  if |b| >= |a| and c != 0
+//	  = 1    if c = 0
+//
+// This allows c and s to be reconstructed from z as follows:
+//
+//	If z = 1, set c = 0, s = 1.
+//	If |z| < 1, set c = sqrt(1 - z^2) and s = z.
+//	If |z| > 1, set c = 1/z and s = sqrt(1 - c^2).
+//
+// NOTE: There is a discrepancy between the reference implementation and the
+// BLAS technical manual regarding the sign for r when a or b are zero. Srotg
+// agrees with the definition in the manual and other common BLAS
+// implementations.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Srotg(a, b float32) (c, s, r, z float32) {
+	// Implementation based on Supplemental Material to:
+	// Edward Anderson. 2017. Algorithm 978: Safe Scaling in the Level 1 BLAS.
+	// ACM Trans. Math. Softw. 44, 1, Article 12 (July 2017), 28 pages.
+	// DOI: https://doi.org/10.1145/3061665
+	const (
+		// safmin is 2^-126 and safmax is its reciprocal; together they bound
+		// the scaling factor used below.
+		safmin = 0x1p-126
+		safmax = 1 / safmin
+	)
+	anorm := math.Abs(a)
+	bnorm := math.Abs(b)
+	switch {
+	case bnorm == 0:
+		// b is zero (this also covers a == b == 0): no rotation, r = a.
+		c = 1
+		s = 0
+		r = a
+		z = 0
+	case anorm == 0:
+		// Only a is zero: c = 0, s = 1 moves b into r.
+		c = 0
+		s = 1
+		r = b
+		z = 1
+	default:
+		// Scale a and b by max(|a|, |b|) clamped to [safmin, safmax] before
+		// squaring, then multiply the square root back by the same factor.
+		maxab := math.Max(anorm, bnorm)
+		scl := math.Min(math.Max(safmin, maxab), safmax)
+		// sigma carries the sign of whichever of a and b is larger in
+		// magnitude; b wins ties.
+		var sigma float32
+		if anorm > bnorm {
+			sigma = math.Copysign(1, a)
+		} else {
+			sigma = math.Copysign(1, b)
+		}
+		ascl := a / scl
+		bscl := b / scl
+		r = sigma * (scl * math.Sqrt(ascl*ascl+bscl*bscl))
+		c = a / r
+		s = b / r
+		// z encodes c and s as described in the function documentation.
+		switch {
+		case anorm > bnorm:
+			z = s
+		case c != 0:
+			z = 1 / c
+		default:
+			z = 1
+		}
+	}
+	return c, s, r, z
+}
+
+// Srotmg computes the modified Givens rotation. See
+// http://www.netlib.org/lapack/explore-html/df/deb/drotmg_8f.html
+// for more details.
+//
+// It returns the rotation parameters p together with the updated values of
+// d1, d2 and x1 (as rd1, rd2 and rx1). For invalid input (d1 < 0, or an
+// intermediate quantity with the wrong sign) it returns p.Flag set to
+// blas.Rescaling with a zero p.H and zeros for the three scalars.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Srotmg(d1, d2, x1, y1 float32) (p blas.SrotmParams, rd1, rd2, rx1 float32) {
+	// The implementation of Drotmg used here is taken from Hopkins 1997
+	// Appendix A: https://doi.org/10.1145/289251.289253
+	// with the exception of the gam constants below.
+
+	const (
+		gam    = 4096.0
+		gamsq  = gam * gam
+		rgamsq = 1.0 / gamsq
+	)
+
+	if d1 < 0 {
+		p.Flag = blas.Rescaling // Error state.
+		return p, 0, 0, 0
+	}
+
+	// Nothing to eliminate: report the identity and pass the inputs through.
+	if d2 == 0 || y1 == 0 {
+		p.Flag = blas.Identity
+		return p, d1, d2, x1
+	}
+
+	var h11, h12, h21, h22 float32
+	if (d1 == 0 || x1 == 0) && d2 > 0 {
+		// The first component vanishes: exchange the roles of the two rows.
+		p.Flag = blas.Diagonal
+		h12 = 1
+		h21 = -1
+		x1 = y1
+		d1, d2 = d2, d1
+	} else {
+		p2 := d2 * y1
+		p1 := d1 * x1
+		q2 := p2 * y1
+		q1 := p1 * x1
+		if math.Abs(q1) > math.Abs(q2) {
+			// Unit diagonal; only h21 and h12 are computed.
+			p.Flag = blas.OffDiagonal
+			h11 = 1
+			h22 = 1
+			h21 = -y1 / x1
+			h12 = p2 / p1
+			u := 1 - float32(h12*h21)
+			if u <= 0 {
+				p.Flag = blas.Rescaling // Error state.
+				return p, 0, 0, 0
+			}
+
+			d1 /= u
+			d2 /= u
+			x1 *= u
+		} else {
+			if q2 < 0 {
+				p.Flag = blas.Rescaling // Error state.
+				return p, 0, 0, 0
+			}
+
+			// Off-diagonal entries fixed at 1 and -1; only h11 and h22 are
+			// computed, and d1 and d2 trade places.
+			p.Flag = blas.Diagonal
+			h21 = -1
+			h12 = 1
+			h11 = p1 / p2
+			h22 = x1 / y1
+			u := 1 + float32(h11*h22)
+			d1, d2 = d2/u, d1/u
+			x1 = y1 * u
+		}
+	}
+
+	// Rescale by powers of gam until d1 lies in (rgamsq, gamsq], adjusting
+	// x1 and the first row of H to match. Any rescaling switches the flag to
+	// blas.Rescaling.
+	for d1 <= rgamsq && d1 != 0 {
+		p.Flag = blas.Rescaling
+		d1 = (d1 * gam) * gam
+		x1 /= gam
+		h11 /= gam
+		h12 /= gam
+	}
+	for d1 > gamsq {
+		p.Flag = blas.Rescaling
+		d1 = (d1 / gam) / gam
+		x1 *= gam
+		h11 *= gam
+		h12 *= gam
+	}
+
+	// Same for |d2|, adjusting the second row of H.
+	for math.Abs(d2) <= rgamsq && d2 != 0 {
+		p.Flag = blas.Rescaling
+		d2 = (d2 * gam) * gam
+		h21 /= gam
+		h22 /= gam
+	}
+	for math.Abs(d2) > gamsq {
+		p.Flag = blas.Rescaling
+		d2 = (d2 / gam) / gam
+		h21 *= gam
+		h22 *= gam
+	}
+
+	// p.H holds h11, h21, h12, h22 in that order; only the entries not
+	// implied by the flag are stored.
+	switch p.Flag {
+	case blas.Diagonal:
+		p.H = [4]float32{0: h11, 3: h22}
+	case blas.OffDiagonal:
+		p.H = [4]float32{1: h21, 2: h12}
+	case blas.Rescaling:
+		p.H = [4]float32{h11, h21, h12, h22}
+	default:
+		panic(badFlag)
+	}
+
+	return p, d1, d2, x1
+}
+
+// Srot applies a plane rotation with parameters c and s to the n pairs
+// (x[i], y[i]) selected by incX and incY.
+//
+//	x[i] = c * x[i] + s * y[i]
+//	y[i] = c * y[i] - s * x[i]
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Srot(n int, x []float32, incX int, y []float32, incY int, c float32, s float32) {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incY == 0:
+		panic(zeroIncY)
+	case n < 0:
+		panic(nLT0)
+	case n == 0:
+		return
+	}
+	if (incX > 0 && (n-1)*incX >= len(x)) || (incX < 0 && (1-n)*incX >= len(x)) {
+		panic(shortX)
+	}
+	if (incY > 0 && (n-1)*incY >= len(y)) || (incY < 0 && (1-n)*incY >= len(y)) {
+		panic(shortY)
+	}
+	if incX == 1 && incY == 1 {
+		// Both new values are computed from the old pair before either is stored.
+		for i, vx := range x[:n] {
+			vy := y[i]
+			x[i], y[i] = c*vx+s*vy, c*vy-s*vx
+		}
+		return
+	}
+	// A negative stride starts at the far end of its slice.
+	var kx, ky int
+	if incX < 0 {
+		kx = (1 - n) * incX
+	}
+	if incY < 0 {
+		ky = (1 - n) * incY
+	}
+	for left := n; left > 0; left-- {
+		vx, vy := x[kx], y[ky]
+		x[kx], y[ky] = c*vx+s*vy, c*vy-s*vx
+		kx += incX
+		ky += incY
+	}
+}
+
+// Srotm applies the modified Givens rotation described by p to the 2×n
+// matrix whose rows are the n elements of x and y selected by incX and incY.
+//
+// p.Flag selects which entries of p.H are used: blas.Rescaling uses all
+// four, blas.OffDiagonal uses H[1] and H[2] with a unit diagonal, and
+// blas.Diagonal uses H[0] and H[3] with off-diagonal entries 1 and -1.
+// blas.Identity, and any other flag value, leaves x and y unchanged.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Srotm(n int, x []float32, incX int, y []float32, incY int, p blas.SrotmParams) {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incY == 0:
+		panic(zeroIncY)
+	case n < 0:
+		panic(nLT0)
+	case n == 0:
+		return
+	}
+	if (incX > 0 && (n-1)*incX >= len(x)) || (incX < 0 && (1-n)*incX >= len(x)) {
+		panic(shortX)
+	}
+	if (incY > 0 && (n-1)*incY >= len(y)) || (incY < 0 && (1-n)*incY >= len(y)) {
+		panic(shortY)
+	}
+
+	if p.Flag == blas.Identity {
+		return
+	}
+
+	// Shared by every case below: whether both vectors are contiguous, and
+	// the starting offsets for the strided loops (a negative stride starts at
+	// the far end of its slice).
+	unit := incX == 1 && incY == 1
+	var kx, ky int
+	if incX < 0 {
+		kx = (1 - n) * incX
+	}
+	if incY < 0 {
+		ky = (1 - n) * incY
+	}
+
+	// The explicit float32 conversions round each product before it is added.
+	switch p.Flag {
+	case blas.Rescaling:
+		h11, h21, h12, h22 := p.H[0], p.H[1], p.H[2], p.H[3]
+		if unit {
+			for i, vx := range x[:n] {
+				vy := y[i]
+				x[i], y[i] = float32(vx*h11)+float32(vy*h12), float32(vx*h21)+float32(vy*h22)
+			}
+			return
+		}
+		for left := n; left > 0; left-- {
+			vx, vy := x[kx], y[ky]
+			x[kx], y[ky] = float32(vx*h11)+float32(vy*h12), float32(vx*h21)+float32(vy*h22)
+			kx += incX
+			ky += incY
+		}
+	case blas.OffDiagonal:
+		h21, h12 := p.H[1], p.H[2]
+		if unit {
+			for i, vx := range x[:n] {
+				vy := y[i]
+				x[i], y[i] = vx+float32(vy*h12), float32(vx*h21)+vy
+			}
+			return
+		}
+		for left := n; left > 0; left-- {
+			vx, vy := x[kx], y[ky]
+			x[kx], y[ky] = vx+float32(vy*h12), float32(vx*h21)+vy
+			kx += incX
+			ky += incY
+		}
+	case blas.Diagonal:
+		h11, h22 := p.H[0], p.H[3]
+		if unit {
+			for i, vx := range x[:n] {
+				vy := y[i]
+				x[i], y[i] = float32(vx*h11)+vy, -vx+float32(vy*h22)
+			}
+			return
+		}
+		for left := n; left > 0; left-- {
+			vx, vy := x[kx], y[ky]
+			x[kx], y[ky] = float32(vx*h11)+vy, -vx+float32(vy*h22)
+			kx += incX
+			ky += incY
+		}
+	}
+}
+
+// Sscal multiplies each of the n elements of x, taken with stride incX, by
+// alpha.
+//
+//	x[i] *= alpha
+//
+// A negative incX leaves x unmodified.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Sscal(n int, alpha float32, x []float32, incX int) {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incX < 0:
+		return
+	case n < 0:
+		panic(nLT0)
+	case n == 0:
+		return
+	case len(x) <= (n-1)*incX:
+		panic(shortX)
+	}
+	unit := incX == 1
+	if alpha == 0 {
+		// Store exact zeros instead of calling the scaling kernels.
+		if unit {
+			head := x[:n]
+			for k := range head {
+				head[k] = 0
+			}
+			return
+		}
+		for k, end := 0, n*incX; k < end; k += incX {
+			x[k] = 0
+		}
+		return
+	}
+	if !unit {
+		f32.ScalInc(alpha, x, uintptr(n), uintptr(incX))
+		return
+	}
+	// Unit stride: hand the leading n elements to the contiguous kernel.
+	f32.ScalUnitary(alpha, x[:n])
+}
--- a/vendor/gonum.org/v1/gonum/blas/gonum/level1float32_dsdot.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level1float32_dsdot.go
@@ -0,0 +1,54 @@
+// Code generated by "go generate gonum.org/v1/gonum/blas/gonum"; DO NOT EDIT.
+
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/internal/asm/f32"
+)
+
+// Dsdot returns, as a float64, the dot product of the n elements of the
+// float32 vectors x and y selected by incX and incY
+//
+//	\sum_i x[i]*y[i]
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Dsdot(n int, x []float32, incX int, y []float32, incY int) float64 {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incY == 0:
+		panic(zeroIncY)
+	case n < 0:
+		panic(nLT0)
+	case n == 0:
+		return 0
+	}
+	if incX == 1 && incY == 1 {
+		// Contiguous operands only need n elements each.
+		if n > len(x) {
+			panic(shortX)
+		}
+		if n > len(y) {
+			panic(shortY)
+		}
+		return f32.DdotUnitary(x[:n], y[:n])
+	}
+	// A negative stride starts at the far end of its slice.
+	var startX, startY int
+	if incX < 0 {
+		startX = (1 - n) * incX
+	}
+	if incY < 0 {
+		startY = (1 - n) * incY
+	}
+	// Both the first and the last index visited must lie inside the slice.
+	if len(x) <= startX || len(x) <= startX+(n-1)*incX {
+		panic(shortX)
+	}
+	if len(y) <= startY || len(y) <= startY+(n-1)*incY {
+		panic(shortY)
+	}
+	return f32.DdotInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(startX), uintptr(startY))
+}
--- a/vendor/gonum.org/v1/gonum/blas/gonum/level1float32_sdot.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level1float32_sdot.go
@@ -0,0 +1,54 @@
+// Code generated by "go generate gonum.org/v1/gonum/blas/gonum"; DO NOT EDIT.
+
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/internal/asm/f32"
+)
+
+// Sdot returns the dot product of the n elements of x and y selected by
+// incX and incY
+//
+//	\sum_i x[i]*y[i]
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Sdot(n int, x []float32, incX int, y []float32, incY int) float32 {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incY == 0:
+		panic(zeroIncY)
+	case n < 0:
+		panic(nLT0)
+	case n == 0:
+		return 0
+	}
+	if incX == 1 && incY == 1 {
+		// Contiguous operands only need n elements each.
+		if n > len(x) {
+			panic(shortX)
+		}
+		if n > len(y) {
+			panic(shortY)
+		}
+		return f32.DotUnitary(x[:n], y[:n])
+	}
+	// A negative stride starts at the far end of its slice.
+	var startX, startY int
+	if incX < 0 {
+		startX = (1 - n) * incX
+	}
+	if incY < 0 {
+		startY = (1 - n) * incY
+	}
+	// Both the first and the last index visited must lie inside the slice.
+	if len(x) <= startX || len(x) <= startX+(n-1)*incX {
+		panic(shortX)
+	}
+	if len(y) <= startY || len(y) <= startY+(n-1)*incY {
+		panic(shortY)
+	}
+	return f32.DotInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(startX), uintptr(startY))
+}
--- a/vendor/gonum.org/v1/gonum/blas/gonum/level1float32_sdsdot.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level1float32_sdsdot.go
@@ -0,0 +1,54 @@
+// Code generated by "go generate gonum.org/v1/gonum/blas/gonum"; DO NOT EDIT.
+
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/internal/asm/f32"
+)
+
+// Sdsdot computes the dot product of the two vectors plus a constant
+//
+//	alpha + \sum_i x[i]*y[i]
+//
+// over the n elements of x and y selected by incX and incY. The sum is
+// obtained as a float64 from f32.DdotUnitary or f32.DdotInc and converted to
+// float32 before alpha is added.
+//
+// If n is 0 the sum is empty and Sdsdot returns alpha. Sdsdot panics if incX
+// or incY is 0, if n is negative, or if x or y is too short for n and its
+// increment.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Sdsdot(n int, alpha float32, x []float32, incX int, y []float32, incY int) float32 {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n <= 0 {
+		if n == 0 {
+			// With no terms to sum, only the constant remains. Returning 0
+			// here would drop alpha, contradicting the documented formula.
+			return alpha
+		}
+		panic(nLT0)
+	}
+	if incX == 1 && incY == 1 {
+		if len(x) < n {
+			panic(shortX)
+		}
+		if len(y) < n {
+			panic(shortY)
+		}
+		return alpha + float32(f32.DdotUnitary(x[:n], y[:n]))
+	}
+	// A negative increment starts at the far end of its slice.
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	// Both the first and the last index visited must lie inside the slice.
+	if ix >= len(x) || ix+(n-1)*incX >= len(x) {
+		panic(shortX)
+	}
+	if iy >= len(y) || iy+(n-1)*incY >= len(y) {
+		panic(shortY)
+	}
+	return alpha + float32(f32.DdotInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy)))
+}
--- a/vendor/gonum.org/v1/gonum/blas/gonum/level1float64.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level1float64.go
@@ -0,0 +1,629 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"math"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/internal/asm/f64"
+)
+
+var _ blas.Float64Level1 = Implementation{}
+
+// Dnrm2 returns the Euclidean (L2) norm of the n elements of x taken with
+// stride incX,
+//
+//	sqrt(\sum_i x[i] * x[i]).
+//
+// A negative incX yields 0 without inspecting n or x. For positive incX,
+// Dnrm2 panics if x is too short for the requested elements or if n is
+// negative; it panics for incX == 0.
+func (Implementation) Dnrm2(n int, x []float64, incX int) float64 {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incX < 0:
+		return 0
+	}
+	if (n-1)*incX >= len(x) {
+		panic(shortX)
+	}
+	// Trivial sizes are answered directly.
+	switch {
+	case n < 0:
+		panic(nLT0)
+	case n == 0:
+		return 0
+	case n == 1:
+		return math.Abs(x[0])
+	}
+	if incX != 1 {
+		return f64.L2NormInc(x, uintptr(n), uintptr(incX))
+	}
+	return f64.L2NormUnitary(x[:n])
+}
+
+// Dasum returns the sum of the absolute values of the n elements of x taken
+// with stride incX.
+//
+//	\sum_i |x[i]|
+//
+// Dasum returns 0 if incX is negative. It panics if n is negative, if incX
+// is zero, or if x is too short for the requested elements.
+func (Implementation) Dasum(n int, x []float64, incX int) float64 {
+	if n < 0 {
+		panic(nLT0)
+	}
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incX < 0:
+		return 0
+	}
+	if (n-1)*incX >= len(x) {
+		panic(shortX)
+	}
+	var total float64
+	if incX == 1 {
+		// Contiguous case: range over exactly the first n elements.
+		for _, v := range x[:n] {
+			total += math.Abs(v)
+		}
+		return total
+	}
+	for i, ix := 0, 0; i < n; i, ix = i+1, ix+incX {
+		total += math.Abs(x[ix])
+	}
+	return total
+}
+
+// Idamax returns the index (counted in strided elements, not slice offsets)
+// of an element of x with the largest absolute value.
+// If there are multiple such indices the earliest is returned.
+// Idamax returns -1 if n == 0 or if incX is negative.
+func (Implementation) Idamax(n int, x []float64, incX int) int {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incX < 0:
+		return -1
+	}
+	if (n-1)*incX >= len(x) {
+		panic(shortX)
+	}
+	switch {
+	case n < 0:
+		panic(nLT0)
+	case n == 0:
+		return -1 // Netlib returns invalid index when n == 0.
+	case n == 1:
+		return 0
+	}
+	// From here on n >= 2, so x[0] exists and seeds the search. Only a
+	// strictly larger magnitude replaces the current best, which keeps the
+	// earliest index on ties.
+	best, largest := 0, math.Abs(x[0])
+	if incX == 1 {
+		for i, v := range x[1:n] {
+			if mag := math.Abs(v); mag > largest {
+				best, largest = i+1, mag
+			}
+		}
+		return best
+	}
+	for i, ix := 1, incX; i < n; i, ix = i+1, ix+incX {
+		if mag := math.Abs(x[ix]); mag > largest {
+			best, largest = i, mag
+		}
+	}
+	return best
+}
+
+// Dswap exchanges the elements of two vectors.
+//
+//	x[i], y[i] = y[i], x[i] for all i
+func (Implementation) Dswap(n int, x []float64, incX int, y []float64, incY int) {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+	if incX == 1 && incY == 1 {
+		x = x[:n]
+		for i, v := range x {
+			x[i], y[i] = y[i], v
+		}
+		return
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	for i := 0; i < n; i++ {
+		x[ix], y[iy] = y[iy], x[ix]
+		ix += incX
+		iy += incY
+	}
+}
+
+// Dcopy copies the elements of x into the elements of y.
+//
+//	y[i] = x[i] for all i
+func (Implementation) Dcopy(n int, x []float64, incX int, y []float64, incY int) {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+	if incX == 1 && incY == 1 {
+		copy(y[:n], x[:n])
+		return
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	for i := 0; i < n; i++ {
+		y[iy] = x[ix]
+		ix += incX
+		iy += incY
+	}
+}
+
+// Daxpy adds alpha times x to y
+//
+//	y[i] += alpha * x[i] for all i
+func (Implementation) Daxpy(n int, alpha float64, x []float64, incX int, y []float64, incY int) {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+	if alpha == 0 {
+		return
+	}
+	if incX == 1 && incY == 1 {
+		f64.AxpyUnitary(alpha, x[:n], y[:n])
+		return
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	f64.AxpyInc(alpha, x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy))
+}
+
+// Drotg computes a plane rotation
+//
+//	⎡  c s ⎤ ⎡ a ⎤ = ⎡ r ⎤
+//	⎣ -s c ⎦ ⎣ b ⎦   ⎣ 0 ⎦
+//
+// satisfying c^2 + s^2 = 1.
+//
+// The computation uses the formulas
+//
+//	sigma = sgn(a)    if |a| >  |b|
+//	      = sgn(b)    if |b| >= |a|
+//	r = sigma*sqrt(a^2 + b^2)
+//	c = 1; s = 0      if r = 0
+//	c = a/r; s = b/r  if r != 0
+//	c >= 0            if |a| > |b|
+//
+// The subroutine also computes
+//
+//	z = s    if |a| > |b|,
+//	  = 1/c  if |b| >= |a| and c != 0
+//	  = 1    if c = 0
+//
+// This allows c and s to be reconstructed from z as follows:
+//
+//	If z = 1, set c = 0, s = 1.
+//	If |z| < 1, set c = sqrt(1 - z^2) and s = z.
+//	If |z| > 1, set c = 1/z and s = sqrt(1 - c^2).
+//
+// NOTE: There is a discrepancy between the reference implementation and the
+// BLAS technical manual regarding the sign for r when a or b are zero. Drotg
+// agrees with the definition in the manual and other common BLAS
+// implementations.
+func (Implementation) Drotg(a, b float64) (c, s, r, z float64) {
+	// Implementation based on Supplemental Material to:
+	// Edward Anderson. 2017. Algorithm 978: Safe Scaling in the Level 1 BLAS.
+	// ACM Trans. Math. Softw. 44, 1, Article 12 (July 2017), 28 pages.
+	// DOI: https://doi.org/10.1145/3061665
+	const (
+		safmin = 0x1p-1022
+		safmax = 1 / safmin
+	)
+	absA, absB := math.Abs(a), math.Abs(b)
+	if absB == 0 {
+		// b is already zero: the identity rotation suffices and r is a.
+		return 1, 0, a, 0
+	}
+	if absA == 0 {
+		// a is zero: r takes the value of b.
+		return 0, 1, b, 1
+	}
+
+	// Divide both inputs by the larger magnitude, clamped to
+	// [safmin, safmax], before squaring them.
+	scale := math.Min(math.Max(safmin, math.Max(absA, absB)), safmax)
+	// r carries the sign of whichever input is larger in magnitude, with
+	// ties going to b.
+	sign := math.Copysign(1, b)
+	if absA > absB {
+		sign = math.Copysign(1, a)
+	}
+	sa := a / scale
+	sb := b / scale
+	r = sign * (scale * math.Sqrt(sa*sa+sb*sb))
+	c = a / r
+	s = b / r
+	switch {
+	case absA > absB:
+		z = s
+	case c != 0:
+		z = 1 / c
+	default:
+		z = 1
+	}
+	return c, s, r, z
+}
+
+// Drotmg computes the modified Givens rotation. See
+// http://www.netlib.org/lapack/explore-html/df/deb/drotmg_8f.html
+// for more details.
+//
+// It returns the rotation parameters p together with the updated scaling
+// factors rd1 and rd2 and the updated coordinate rx1. When the inputs do not
+// admit a rotation (d1 < 0, or one of the sign checks below fails) p.Flag is
+// set to blas.Rescaling with p.H left at its zero value, and rd1, rd2 and
+// rx1 are all zero.
+func (Implementation) Drotmg(d1, d2, x1, y1 float64) (p blas.DrotmParams, rd1, rd2, rx1 float64) {
+	// The implementation of Drotmg used here is taken from Hopkins 1997
+	// Appendix A: https://doi.org/10.1145/289251.289253
+	// with the exception of the gam constants below.
+
+	// gam is the rescaling factor; d1 and |d2| are driven into the
+	// interval (rgamsq, gamsq] by the loops further down.
+	const (
+		gam    = 4096.0
+		gamsq  = gam * gam
+		rgamsq = 1.0 / gamsq
+	)
+
+	if d1 < 0 {
+		p.Flag = blas.Rescaling // Error state.
+		return p, 0, 0, 0
+	}
+
+	// Nothing to rotate: the inputs are returned unchanged.
+	if d2 == 0 || y1 == 0 {
+		p.Flag = blas.Identity
+		return p, d1, d2, x1
+	}
+
+	var h11, h12, h21, h22 float64
+	if (d1 == 0 || x1 == 0) && d2 > 0 {
+		// The first component carries no weight: swap the roles of the two
+		// components.
+		p.Flag = blas.Diagonal
+		h12 = 1
+		h21 = -1
+		x1 = y1
+		d1, d2 = d2, d1
+	} else {
+		p2 := d2 * y1
+		p1 := d1 * x1
+		q2 := p2 * y1
+		q1 := p1 * x1
+		if math.Abs(q1) > math.Abs(q2) {
+			p.Flag = blas.OffDiagonal
+			h11 = 1
+			h22 = 1
+			h21 = -y1 / x1
+			h12 = p2 / p1
+			// The explicit float64 conversion rounds the product before the
+			// subtraction, which prevents the compiler from fusing the two
+			// operations (see the Go spec on floating-point operators).
+			u := 1 - float64(h12*h21)
+			if u <= 0 {
+				p.Flag = blas.Rescaling // Error state.
+				return p, 0, 0, 0
+			}
+
+			d1 /= u
+			d2 /= u
+			x1 *= u
+		} else {
+			if q2 < 0 {
+				p.Flag = blas.Rescaling // Error state.
+				return p, 0, 0, 0
+			}
+
+			p.Flag = blas.Diagonal
+			h21 = -1
+			h12 = 1
+			h11 = p1 / p2
+			h22 = x1 / y1
+			// Conversion rounds the product before the addition; see above.
+			u := 1 + float64(h11*h22)
+			d1, d2 = d2/u, d1/u
+			x1 = y1 * u
+		}
+	}
+
+	// Rescale d1 into (rgamsq, gamsq], compensating in x1 and the first row
+	// of H. Any rescaling step switches the result to the full-matrix form.
+	for d1 <= rgamsq && d1 != 0 {
+		p.Flag = blas.Rescaling
+		d1 = (d1 * gam) * gam
+		x1 /= gam
+		h11 /= gam
+		h12 /= gam
+	}
+	for d1 > gamsq {
+		p.Flag = blas.Rescaling
+		d1 = (d1 / gam) / gam
+		x1 *= gam
+		h11 *= gam
+		h12 *= gam
+	}
+
+	// Same for |d2|, compensating in the second row of H.
+	for math.Abs(d2) <= rgamsq && d2 != 0 {
+		p.Flag = blas.Rescaling
+		d2 = (d2 * gam) * gam
+		h21 /= gam
+		h22 /= gam
+	}
+	for math.Abs(d2) > gamsq {
+		p.Flag = blas.Rescaling
+		d2 = (d2 / gam) / gam
+		h21 *= gam
+		h22 *= gam
+	}
+
+	// Store only the entries of H that the flag does not already imply.
+	// p.H is laid out as {h11, h21, h12, h22}.
+	switch p.Flag {
+	case blas.Diagonal:
+		p.H = [4]float64{0: h11, 3: h22}
+	case blas.OffDiagonal:
+		p.H = [4]float64{1: h21, 2: h12}
+	case blas.Rescaling:
+		p.H = [4]float64{h11, h21, h12, h22}
+	default:
+		panic(badFlag)
+	}
+
+	return p, d1, d2, x1
+}
+
+// Drot applies a plane transformation.
+//
+//	x[i] = c * x[i] + s * y[i]
+//	y[i] = c * y[i] - s * x[i]
+func (Implementation) Drot(n int, x []float64, incX int, y []float64, incY int, c float64, s float64) {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n < 1 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+	if incX == 1 && incY == 1 {
+		x = x[:n]
+		for i, vx := range x {
+			vy := y[i]
+			x[i], y[i] = c*vx+s*vy, c*vy-s*vx
+		}
+		return
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	for i := 0; i < n; i++ {
+		vx := x[ix]
+		vy := y[iy]
+		x[ix], y[iy] = c*vx+s*vy, c*vy-s*vx
+		ix += incX
+		iy += incY
+	}
+}
+
+// Drotm applies the modified Givens rotation to the 2×n matrix.
+//
+// The two rows of that matrix are the n strided elements of x and of y. The
+// entries of the 2×2 matrix H are taken from p.H, laid out as
+// {h11, h21, h12, h22}; p.Flag selects which of them are read and which are
+// implied. x and y are left unchanged when p.Flag is blas.Identity, and also
+// when it is none of the flags handled below.
+func (Implementation) Drotm(n int, x []float64, incX int, y []float64, incY int, p blas.DrotmParams) {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n <= 0 {
+		if n == 0 {
+			return
+		}
+		panic(nLT0)
+	}
+	// The furthest offset touched is (n-1)*|inc|.
+	if (incX > 0 && len(x) <= (n-1)*incX) || (incX < 0 && len(x) <= (1-n)*incX) {
+		panic(shortX)
+	}
+	if (incY > 0 && len(y) <= (n-1)*incY) || (incY < 0 && len(y) <= (1-n)*incY) {
+		panic(shortY)
+	}
+
+	if p.Flag == blas.Identity {
+		return
+	}
+
+	// In every case below the explicit float64 conversions round each
+	// product before it is added, which prevents the compiler from fusing
+	// the multiply and add (see the Go spec on floating-point operators).
+	switch p.Flag {
+	case blas.Rescaling:
+		// Full matrix: all four entries of H are used.
+		h11 := p.H[0]
+		h12 := p.H[2]
+		h21 := p.H[1]
+		h22 := p.H[3]
+		if incX == 1 && incY == 1 {
+			x = x[:n]
+			for i, vx := range x {
+				vy := y[i]
+				x[i], y[i] = float64(vx*h11)+float64(vy*h12), float64(vx*h21)+float64(vy*h22)
+			}
+			return
+		}
+		// Negative increments traverse their vector from the far end.
+		var ix, iy int
+		if incX < 0 {
+			ix = (-n + 1) * incX
+		}
+		if incY < 0 {
+			iy = (-n + 1) * incY
+		}
+		for i := 0; i < n; i++ {
+			vx := x[ix]
+			vy := y[iy]
+			x[ix], y[iy] = float64(vx*h11)+float64(vy*h12), float64(vx*h21)+float64(vy*h22)
+			ix += incX
+			iy += incY
+		}
+	case blas.OffDiagonal:
+		// Only h12 and h21 are read; the diagonal entries act as 1.
+		h12 := p.H[2]
+		h21 := p.H[1]
+		if incX == 1 && incY == 1 {
+			x = x[:n]
+			for i, vx := range x {
+				vy := y[i]
+				x[i], y[i] = vx+float64(vy*h12), float64(vx*h21)+vy
+			}
+			return
+		}
+		var ix, iy int
+		if incX < 0 {
+			ix = (-n + 1) * incX
+		}
+		if incY < 0 {
+			iy = (-n + 1) * incY
+		}
+		for i := 0; i < n; i++ {
+			vx := x[ix]
+			vy := y[iy]
+			x[ix], y[iy] = vx+float64(vy*h12), float64(vx*h21)+vy
+			ix += incX
+			iy += incY
+		}
+	case blas.Diagonal:
+		// Only h11 and h22 are read; h12 acts as 1 and h21 as -1.
+		h11 := p.H[0]
+		h22 := p.H[3]
+		if incX == 1 && incY == 1 {
+			x = x[:n]
+			for i, vx := range x {
+				vy := y[i]
+				x[i], y[i] = float64(vx*h11)+vy, -vx+float64(vy*h22)
+			}
+			return
+		}
+		var ix, iy int
+		if incX < 0 {
+			ix = (-n + 1) * incX
+		}
+		if incY < 0 {
+			iy = (-n + 1) * incY
+		}
+		for i := 0; i < n; i++ {
+			vx := x[ix]
+			vy := y[iy]
+			x[ix], y[iy] = float64(vx*h11)+vy, -vx+float64(vy*h22)
+			ix += incX
+			iy += incY
+		}
+	}
+}
+
+// Dscal multiplies the n strided elements of x by alpha.
+//
+//	x[i] *= alpha
+//
+// Dscal has no effect if incX < 0. When alpha == 0 the elements are set to
+// zero by assignment rather than by multiplication. Dscal panics if incX is
+// zero, if n is negative, or if x is too short.
+func (Implementation) Dscal(n int, alpha float64, x []float64, incX int) {
+	switch {
+	case incX == 0:
+		panic(zeroIncX)
+	case incX < 0:
+		return
+	case n < 0:
+		panic(nLT0)
+	case n == 0:
+		return
+	}
+	if (n-1)*incX >= len(x) {
+		panic(shortX)
+	}
+	unit := incX == 1
+	if alpha == 0 {
+		if unit {
+			for i := range x[:n] {
+				x[i] = 0
+			}
+			return
+		}
+		end := n * incX
+		for ix := 0; ix < end; ix += incX {
+			x[ix] = 0
+		}
+		return
+	}
+	if unit {
+		f64.ScalUnitary(alpha, x[:n])
+		return
+	}
+	f64.ScalInc(alpha, x, uintptr(n), uintptr(incX))
+}
--- a/vendor/gonum.org/v1/gonum/blas/gonum/level1float64_ddot.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level1float64_ddot.go
@@ -0,0 +1,50 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/internal/asm/f64"
+)
+
+// Ddot computes the dot product of the two vectors
+//
+//	\sum_i x[i]*y[i]
+func (Implementation) Ddot(n int, x []float64, incX int, y []float64, incY int) float64 {
+	if incX == 0 {
+		panic(zeroIncX)
+	}
+	if incY == 0 {
+		panic(zeroIncY)
+	}
+	if n <= 0 {
+		if n == 0 {
+			return 0
+		}
+		panic(nLT0)
+	}
+	if incX == 1 && incY == 1 {
+		if len(x) < n {
+			panic(shortX)
+		}
+		if len(y) < n {
+			panic(shortY)
+		}
+		return f64.DotUnitary(x[:n], y[:n])
+	}
+	var ix, iy int
+	if incX < 0 {
+		ix = (-n + 1) * incX
+	}
+	if incY < 0 {
+		iy = (-n + 1) * incY
+	}
+	if ix >= len(x) || ix+(n-1)*incX >= len(x) {
+		panic(shortX)
+	}
+	if iy >= len(y) || iy+(n-1)*incY >= len(y) {
+		panic(shortY)
+	}
+	return f64.DotInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy))
+}
--- a/vendor/gonum.org/v1/gonum/blas/gonum/level2cmplx128.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level2cmplx128.go
--- a/vendor/gonum.org/v1/gonum/blas/gonum/level2cmplx64.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level2cmplx64.go
--- a/vendor/gonum.org/v1/gonum/blas/gonum/level2float32.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level2float32.go
--- a/vendor/gonum.org/v1/gonum/blas/gonum/level2float64.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level2float64.go
--- a/vendor/gonum.org/v1/gonum/blas/gonum/level3cmplx128.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level3cmplx128.go
--- a/vendor/gonum.org/v1/gonum/blas/gonum/level3cmplx64.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level3cmplx64.go
--- a/vendor/gonum.org/v1/gonum/blas/gonum/level3float32.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level3float32.go
@@ -0,0 +1,925 @@
+// Code generated by "go generate gonum.org/v1/gonum/blas/gonum"; DO NOT EDIT.
+
+// Copyright ©2014 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/internal/asm/f32"
+)
+
+var _ blas.Float32Level3 = Implementation{}
+
+// Strsm solves one of the matrix equations
+//
+//	A * X = alpha * B   if tA == blas.NoTrans and side == blas.Left
+//	Aᵀ * X = alpha * B  if tA == blas.Trans or blas.ConjTrans, and side == blas.Left
+//	X * A = alpha * B   if tA == blas.NoTrans and side == blas.Right
+//	X * Aᵀ = alpha * B  if tA == blas.Trans or blas.ConjTrans, and side == blas.Right
+//
+// where A is an n×n or m×m triangular matrix, X and B are m×n matrices, and alpha is a
+// scalar.
+//
+// At entry to the function, X contains the values of B, and the result is
+// stored in-place into X.
+//
+// No check is made that A is invertible.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Strsm(s blas.Side, ul blas.Uplo, tA blas.Transpose, d blas.Diag, m, n int, alpha float32, a []float32, lda int, b []float32, ldb int) {
+	// Validate the enumerated arguments and the dimensions before any data
+	// is touched.
+	if s != blas.Left && s != blas.Right {
+		panic(badSide)
+	}
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if d != blas.NonUnit && d != blas.Unit {
+		panic(badDiag)
+	}
+	if m < 0 {
+		panic(mLT0)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	// k is the order of the triangular matrix A: m when A is applied from
+	// the left, n when it is applied from the right.
+	k := n
+	if s == blas.Left {
+		k = m
+	}
+	if lda < max(1, k) {
+		panic(badLdA)
+	}
+	if ldb < max(1, n) {
+		panic(badLdB)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(k-1)+k {
+		panic(shortA)
+	}
+	if len(b) < ldb*(m-1)+n {
+		panic(shortB)
+	}
+
+	// With alpha == 0 the right-hand side is zero, so B is zeroed by
+	// assignment and A is never read.
+	if alpha == 0 {
+		for i := 0; i < m; i++ {
+			btmp := b[i*ldb : i*ldb+n]
+			for j := range btmp {
+				btmp[j] = 0
+			}
+		}
+		return
+	}
+	// With a unit diagonal the diagonal entries of A are never read.
+	nonUnit := d == blas.NonUnit
+	if s == blas.Left {
+		if tA == blas.NoTrans {
+			if ul == blas.Upper {
+				// Rows are solved from the last to the first: row i has
+				// the already-solved rows k > i eliminated from it and is
+				// then divided by the diagonal entry.
+				for i := m - 1; i >= 0; i-- {
+					btmp := b[i*ldb : i*ldb+n]
+					if alpha != 1 {
+						f32.ScalUnitary(alpha, btmp)
+					}
+					for ka, va := range a[i*lda+i+1 : i*lda+m] {
+						if va != 0 {
+							// ka indexes the sub-slice; k is the row of B.
+							k := ka + i + 1
+							f32.AxpyUnitary(-va, b[k*ldb:k*ldb+n], btmp)
+						}
+					}
+					if nonUnit {
+						tmp := 1 / a[i*lda+i]
+						f32.ScalUnitary(tmp, btmp)
+					}
+				}
+				return
+			}
+			// Lower triangular: rows are solved from the first to the
+			// last, eliminating the already-solved rows k < i.
+			for i := 0; i < m; i++ {
+				btmp := b[i*ldb : i*ldb+n]
+				if alpha != 1 {
+					f32.ScalUnitary(alpha, btmp)
+				}
+				for k, va := range a[i*lda : i*lda+i] {
+					if va != 0 {
+						f32.AxpyUnitary(-va, b[k*ldb:k*ldb+n], btmp)
+					}
+				}
+				if nonUnit {
+					tmp := 1 / a[i*lda+i]
+					f32.ScalUnitary(tmp, btmp)
+				}
+			}
+			return
+		}
+		// Cases where a is transposed
+		if ul == blas.Upper {
+			// Row k is finished first (divided by the diagonal), then
+			// subtracted from every later row i > k using row k of A; the
+			// alpha scaling is applied to row k last.
+			for k := 0; k < m; k++ {
+				btmpk := b[k*ldb : k*ldb+n]
+				if nonUnit {
+					tmp := 1 / a[k*lda+k]
+					f32.ScalUnitary(tmp, btmpk)
+				}
+				for ia, va := range a[k*lda+k+1 : k*lda+m] {
+					if va != 0 {
+						// ia indexes the sub-slice; i is the row of B.
+						i := ia + k + 1
+						f32.AxpyUnitary(-va, btmpk, b[i*ldb:i*ldb+n])
+					}
+				}
+				if alpha != 1 {
+					f32.ScalUnitary(alpha, btmpk)
+				}
+			}
+			return
+		}
+		// Lower triangular, transposed: the same scheme run from the last
+		// row to the first, updating the earlier rows i < k.
+		for k := m - 1; k >= 0; k-- {
+			btmpk := b[k*ldb : k*ldb+n]
+			if nonUnit {
+				tmp := 1 / a[k*lda+k]
+				f32.ScalUnitary(tmp, btmpk)
+			}
+			for i, va := range a[k*lda : k*lda+k] {
+				if va != 0 {
+					f32.AxpyUnitary(-va, btmpk, b[i*ldb:i*ldb+n])
+				}
+			}
+			if alpha != 1 {
+				f32.ScalUnitary(alpha, btmpk)
+			}
+		}
+		return
+	}
+	// Cases where a is to the right of X.
+	// Every row of B is solved independently of the others.
+	if tA == blas.NoTrans {
+		if ul == blas.Upper {
+			for i := 0; i < m; i++ {
+				btmp := b[i*ldb : i*ldb+n]
+				if alpha != 1 {
+					f32.ScalUnitary(alpha, btmp)
+				}
+				// Entries are finished left to right; each finished entry
+				// is eliminated from the entries to its right.
+				for k, vb := range btmp {
+					if vb == 0 {
+						continue
+					}
+					if nonUnit {
+						btmp[k] /= a[k*lda+k]
+					}
+					f32.AxpyUnitary(-btmp[k], a[k*lda+k+1:k*lda+n], btmp[k+1:n])
+				}
+			}
+			return
+		}
+		for i := 0; i < m; i++ {
+			btmp := b[i*ldb : i*ldb+n]
+			if alpha != 1 {
+				f32.ScalUnitary(alpha, btmp)
+			}
+			// Entries are finished right to left; each finished entry is
+			// eliminated from the entries to its left.
+			for k := n - 1; k >= 0; k-- {
+				if btmp[k] == 0 {
+					continue
+				}
+				if nonUnit {
+					btmp[k] /= a[k*lda+k]
+				}
+				f32.AxpyUnitary(-btmp[k], a[k*lda:k*lda+k], btmp[:k])
+			}
+		}
+		return
+	}
+	// Cases where a is transposed.
+	if ul == blas.Upper {
+		for i := 0; i < m; i++ {
+			btmp := b[i*ldb : i*ldb+n]
+			// Entry j depends on the already-solved entries to its right,
+			// combined through row j of A.
+			for j := n - 1; j >= 0; j-- {
+				tmp := alpha*btmp[j] - f32.DotUnitary(a[j*lda+j+1:j*lda+n], btmp[j+1:])
+				if nonUnit {
+					tmp /= a[j*lda+j]
+				}
+				btmp[j] = tmp
+			}
+		}
+		return
+	}
+	for i := 0; i < m; i++ {
+		btmp := b[i*ldb : i*ldb+n]
+		// Entry j depends on the already-solved entries to its left.
+		for j := 0; j < n; j++ {
+			tmp := alpha*btmp[j] - f32.DotUnitary(a[j*lda:j*lda+j], btmp[:j])
+			if nonUnit {
+				tmp /= a[j*lda+j]
+			}
+			btmp[j] = tmp
+		}
+	}
+}
+
+// Ssymm performs one of the matrix-matrix operations
+//
+//	C = alpha * A * B + beta * C  if side == blas.Left
+//	C = alpha * B * A + beta * C  if side == blas.Right
+//
+// where A is an n×n or m×m symmetric matrix, B and C are m×n matrices, and alpha
+// is a scalar.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Ssymm(s blas.Side, ul blas.Uplo, m, n int, alpha float32, a []float32, lda int, b []float32, ldb int, beta float32, c []float32, ldc int) {
+	// Validate the enumerated arguments and the dimensions before any data
+	// is touched.
+	if s != blas.Right && s != blas.Left {
+		panic(badSide)
+	}
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if m < 0 {
+		panic(mLT0)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	// k is the order of the symmetric matrix A: m when A is applied from the
+	// left, n when it is applied from the right.
+	k := n
+	if s == blas.Left {
+		k = m
+	}
+	if lda < max(1, k) {
+		panic(badLdA)
+	}
+	if ldb < max(1, n) {
+		panic(badLdB)
+	}
+	if ldc < max(1, n) {
+		panic(badLdC)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(k-1)+k {
+		panic(shortA)
+	}
+	if len(b) < ldb*(m-1)+n {
+		panic(shortB)
+	}
+	if len(c) < ldc*(m-1)+n {
+		panic(shortC)
+	}
+
+	// Quick return if possible.
+	if alpha == 0 && beta == 1 {
+		return
+	}
+
+	// With beta == 0 the old contents of C are discarded by assignment, so
+	// the later multiplications by beta act on zeros.
+	if beta == 0 {
+		for i := 0; i < m; i++ {
+			ctmp := c[i*ldc : i*ldc+n]
+			for j := range ctmp {
+				ctmp[j] = 0
+			}
+		}
+	}
+
+	// With alpha == 0 only the beta scaling of C remains; A and B are not
+	// read.
+	if alpha == 0 {
+		if beta != 0 {
+			for i := 0; i < m; i++ {
+				ctmp := c[i*ldc : i*ldc+n]
+				for j := 0; j < n; j++ {
+					ctmp[j] *= beta
+				}
+			}
+		}
+		return
+	}
+
+	// isUpper selects which triangle of A is read; an element of the other
+	// triangle is obtained by swapping its row and column index.
+	isUpper := ul == blas.Upper
+	if s == blas.Left {
+		for i := 0; i < m; i++ {
+			// Start row i of C with beta*C plus the diagonal term of A.
+			atmp := alpha * a[i*lda+i]
+			btmp := b[i*ldb : i*ldb+n]
+			ctmp := c[i*ldc : i*ldc+n]
+			for j, v := range btmp {
+				ctmp[j] *= beta
+				ctmp[j] += atmp * v
+			}
+
+			// Add the contributions of the columns k < i of row i of A ...
+			for k := 0; k < i; k++ {
+				var atmp float32
+				if isUpper {
+					atmp = a[k*lda+i]
+				} else {
+					atmp = a[i*lda+k]
+				}
+				atmp *= alpha
+				f32.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ctmp)
+			}
+			// ... and of the columns k > i.
+			for k := i + 1; k < m; k++ {
+				var atmp float32
+				if isUpper {
+					atmp = a[i*lda+k]
+				} else {
+					atmp = a[k*lda+i]
+				}
+				atmp *= alpha
+				f32.AxpyUnitary(atmp, b[k*ldb:k*ldb+n], ctmp)
+			}
+		}
+		return
+	}
+	if isUpper {
+		for i := 0; i < m; i++ {
+			// j runs downwards so that every entry of row i of C to the
+			// right of column j has already been scaled by beta before the
+			// off-diagonal contributions are added to it.
+			for j := n - 1; j >= 0; j-- {
+				tmp := alpha * b[i*ldb+j]
+				var tmp2 float32
+				atmp := a[j*lda+j+1 : j*lda+n]
+				btmp := b[i*ldb+j+1 : i*ldb+n]
+				ctmp := c[i*ldc+j+1 : i*ldc+n]
+				for k, v := range atmp {
+					ctmp[k] += tmp * v
+					tmp2 += btmp[k] * v
+				}
+				c[i*ldc+j] *= beta
+				c[i*ldc+j] += tmp*a[j*lda+j] + alpha*tmp2
+			}
+		}
+		return
+	}
+	for i := 0; i < m; i++ {
+		// j runs upwards so that every entry of row i of C to the left of
+		// column j has already been scaled by beta before the off-diagonal
+		// contributions are added to it.
+		for j := 0; j < n; j++ {
+			tmp := alpha * b[i*ldb+j]
+			var tmp2 float32
+			atmp := a[j*lda : j*lda+j]
+			btmp := b[i*ldb : i*ldb+j]
+			ctmp := c[i*ldc : i*ldc+j]
+			for k, v := range atmp {
+				ctmp[k] += tmp * v
+				tmp2 += btmp[k] * v
+			}
+			c[i*ldc+j] *= beta
+			c[i*ldc+j] += tmp*a[j*lda+j] + alpha*tmp2
+		}
+	}
+}
+
+// Ssyrk performs one of the symmetric rank-k operations
+//
+//	C = alpha * A * Aᵀ + beta * C  if tA == blas.NoTrans
+//	C = alpha * Aᵀ * A + beta * C  if tA == blas.Trans or tA == blas.ConjTrans
+//
+// where A is an n×k or k×n matrix, C is an n×n symmetric matrix, and alpha and
+// beta are scalars.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Ssyrk(ul blas.Uplo, tA blas.Transpose, n, k int, alpha float32, a []float32, lda int, beta float32, c []float32, ldc int) {
+	// Validate the enumerated arguments and the dimensions before any data
+	// is touched.
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if tA != blas.Trans && tA != blas.NoTrans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if k < 0 {
+		panic(kLT0)
+	}
+	// row and col are the stored dimensions of A: n×k when it is used as
+	// is, k×n when it is used transposed.
+	row, col := k, n
+	if tA == blas.NoTrans {
+		row, col = n, k
+	}
+	if lda < max(1, col) {
+		panic(badLdA)
+	}
+	if ldc < max(1, n) {
+		panic(badLdC)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(row-1)+col {
+		panic(shortA)
+	}
+	if len(c) < ldc*(n-1)+n {
+		panic(shortC)
+	}
+
+	// Only the triangle of C selected by ul is read or written below. In
+	// every branch beta == 0 discards the old contents of C by assignment,
+	// so that NaN or Inf values already in C do not reach the result.
+	if alpha == 0 {
+		// Only the beta scaling of C remains; A is not read.
+		if beta == 0 {
+			if ul == blas.Upper {
+				for i := 0; i < n; i++ {
+					ctmp := c[i*ldc+i : i*ldc+n]
+					for j := range ctmp {
+						ctmp[j] = 0
+					}
+				}
+				return
+			}
+			for i := 0; i < n; i++ {
+				ctmp := c[i*ldc : i*ldc+i+1]
+				for j := range ctmp {
+					ctmp[j] = 0
+				}
+			}
+			return
+		}
+		if ul == blas.Upper {
+			for i := 0; i < n; i++ {
+				ctmp := c[i*ldc+i : i*ldc+n]
+				for j := range ctmp {
+					ctmp[j] *= beta
+				}
+			}
+			return
+		}
+		for i := 0; i < n; i++ {
+			ctmp := c[i*ldc : i*ldc+i+1]
+			for j := range ctmp {
+				ctmp[j] *= beta
+			}
+		}
+		return
+	}
+	if tA == blas.NoTrans {
+		// Each stored entry of C is the dot product of two rows of A.
+		if ul == blas.Upper {
+			for i := 0; i < n; i++ {
+				ctmp := c[i*ldc+i : i*ldc+n]
+				atmp := a[i*lda : i*lda+k]
+				if beta == 0 {
+					for jc := range ctmp {
+						// jc indexes the sub-slice; j is the column of C.
+						j := jc + i
+						ctmp[jc] = alpha * f32.DotUnitary(atmp, a[j*lda:j*lda+k])
+					}
+				} else {
+					for jc, vc := range ctmp {
+						j := jc + i
+						ctmp[jc] = vc*beta + alpha*f32.DotUnitary(atmp, a[j*lda:j*lda+k])
+					}
+				}
+			}
+			return
+		}
+		for i := 0; i < n; i++ {
+			ctmp := c[i*ldc : i*ldc+i+1]
+			atmp := a[i*lda : i*lda+k]
+			if beta == 0 {
+				for j := range ctmp {
+					ctmp[j] = alpha * f32.DotUnitary(a[j*lda:j*lda+k], atmp)
+				}
+			} else {
+				for j, vc := range ctmp {
+					ctmp[j] = vc*beta + alpha*f32.DotUnitary(a[j*lda:j*lda+k], atmp)
+				}
+			}
+		}
+		return
+	}
+	// Cases where a is transposed.
+	// Row i of C is scaled first and then accumulates one update per row l
+	// of A.
+	if ul == blas.Upper {
+		for i := 0; i < n; i++ {
+			ctmp := c[i*ldc+i : i*ldc+n]
+			if beta == 0 {
+				for j := range ctmp {
+					ctmp[j] = 0
+				}
+			} else if beta != 1 {
+				for j := range ctmp {
+					ctmp[j] *= beta
+				}
+			}
+			for l := 0; l < k; l++ {
+				tmp := alpha * a[l*lda+i]
+				if tmp != 0 {
+					f32.AxpyUnitary(tmp, a[l*lda+i:l*lda+n], ctmp)
+				}
+			}
+		}
+		return
+	}
+	for i := 0; i < n; i++ {
+		ctmp := c[i*ldc : i*ldc+i+1]
+		if beta == 0 {
+			// Assign rather than multiply, as the upper-triangle branch
+			// does: 0*NaN and 0*Inf are NaN and would otherwise leak into
+			// the result.
+			for j := range ctmp {
+				ctmp[j] = 0
+			}
+		} else if beta != 1 {
+			for j := range ctmp {
+				ctmp[j] *= beta
+			}
+		}
+		for l := 0; l < k; l++ {
+			tmp := alpha * a[l*lda+i]
+			if tmp != 0 {
+				f32.AxpyUnitary(tmp, a[l*lda:l*lda+i+1], ctmp)
+			}
+		}
+	}
+}
+
+// Ssyr2k performs one of the symmetric rank 2k operations
+//
+//	C = alpha * A * Bᵀ + alpha * B * Aᵀ + beta * C  if tA == blas.NoTrans
+//	C = alpha * Aᵀ * B + alpha * Bᵀ * A + beta * C  if tA == blas.Trans or tA == blas.ConjTrans
+//
+// where A and B are n×k or k×n matrices, C is an n×n symmetric matrix, and
+// alpha and beta are scalars.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Ssyr2k(ul blas.Uplo, tA blas.Transpose, n, k int, alpha float32, a []float32, lda int, b []float32, ldb int, beta float32, c []float32, ldc int) {
+	// Validate the enumerated arguments and the dimensions before any data
+	// is touched.
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if tA != blas.Trans && tA != blas.NoTrans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if k < 0 {
+		panic(kLT0)
+	}
+	// row and col are the stored dimensions of A and B: n×k when they are
+	// used as is, k×n when they are used transposed.
+	row, col := k, n
+	if tA == blas.NoTrans {
+		row, col = n, k
+	}
+	if lda < max(1, col) {
+		panic(badLdA)
+	}
+	if ldb < max(1, col) {
+		panic(badLdB)
+	}
+	if ldc < max(1, n) {
+		panic(badLdC)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(row-1)+col {
+		panic(shortA)
+	}
+	if len(b) < ldb*(row-1)+col {
+		panic(shortB)
+	}
+	if len(c) < ldc*(n-1)+n {
+		panic(shortC)
+	}
+
+	// Only the triangle of C selected by ul is read or written below.
+	if alpha == 0 {
+		// Only the beta scaling of C remains; A and B are not read. With
+		// beta == 0 the triangle is zeroed by assignment.
+		if beta == 0 {
+			if ul == blas.Upper {
+				for i := 0; i < n; i++ {
+					ctmp := c[i*ldc+i : i*ldc+n]
+					for j := range ctmp {
+						ctmp[j] = 0
+					}
+				}
+				return
+			}
+			for i := 0; i < n; i++ {
+				ctmp := c[i*ldc : i*ldc+i+1]
+				for j := range ctmp {
+					ctmp[j] = 0
+				}
+			}
+			return
+		}
+		if ul == blas.Upper {
+			for i := 0; i < n; i++ {
+				ctmp := c[i*ldc+i : i*ldc+n]
+				for j := range ctmp {
+					ctmp[j] *= beta
+				}
+			}
+			return
+		}
+		for i := 0; i < n; i++ {
+			ctmp := c[i*ldc : i*ldc+i+1]
+			for j := range ctmp {
+				ctmp[j] *= beta
+			}
+		}
+		return
+	}
+	if tA == blas.NoTrans {
+		// Each stored entry (i, j) of C combines two dot products: row j of
+		// A with row i of B (tmp1), and row i of A with row j of B (tmp2).
+		if ul == blas.Upper {
+			for i := 0; i < n; i++ {
+				atmp := a[i*lda : i*lda+k]
+				btmp := b[i*ldb : i*ldb+k]
+				ctmp := c[i*ldc+i : i*ldc+n]
+				if beta == 0 {
+					// The old value of C is overwritten, not scaled.
+					for jc := range ctmp {
+						// jc indexes the sub-slice; j is the column of C.
+						j := i + jc
+						var tmp1, tmp2 float32
+						binner := b[j*ldb : j*ldb+k]
+						for l, v := range a[j*lda : j*lda+k] {
+							tmp1 += v * btmp[l]
+							tmp2 += atmp[l] * binner[l]
+						}
+						ctmp[jc] = alpha * (tmp1 + tmp2)
+					}
+				} else {
+					for jc := range ctmp {
+						j := i + jc
+						var tmp1, tmp2 float32
+						binner := b[j*ldb : j*ldb+k]
+						for l, v := range a[j*lda : j*lda+k] {
+							tmp1 += v * btmp[l]
+							tmp2 += atmp[l] * binner[l]
+						}
+						ctmp[jc] *= beta
+						ctmp[jc] += alpha * (tmp1 + tmp2)
+					}
+				}
+			}
+			return
+		}
+		for i := 0; i < n; i++ {
+			atmp := a[i*lda : i*lda+k]
+			btmp := b[i*ldb : i*ldb+k]
+			ctmp := c[i*ldc : i*ldc+i+1]
+			if beta == 0 {
+				// The old value of C is overwritten, not scaled.
+				for j := 0; j <= i; j++ {
+					var tmp1, tmp2 float32
+					binner := b[j*ldb : j*ldb+k]
+					for l, v := range a[j*lda : j*lda+k] {
+						tmp1 += v * btmp[l]
+						tmp2 += atmp[l] * binner[l]
+					}
+					ctmp[j] = alpha * (tmp1 + tmp2)
+				}
+			} else {
+				for j := 0; j <= i; j++ {
+					var tmp1, tmp2 float32
+					binner := b[j*ldb : j*ldb+k]
+					for l, v := range a[j*lda : j*lda+k] {
+						tmp1 += v * btmp[l]
+						tmp2 += atmp[l] * binner[l]
+					}
+					ctmp[j] *= beta
+					ctmp[j] += alpha * (tmp1 + tmp2)
+				}
+			}
+		}
+		return
+	}
+	// Cases where a and b are transposed: row i of C is scaled first and
+	// then accumulates one update per row l of A and B.
+	if ul == blas.Upper {
+		for i := 0; i < n; i++ {
+			ctmp := c[i*ldc+i : i*ldc+n]
+			switch beta {
+			case 0:
+				for j := range ctmp {
+					ctmp[j] = 0
+				}
+			case 1:
+				// C is used as is.
+			default:
+				for j := range ctmp {
+					ctmp[j] *= beta
+				}
+			}
+			for l := 0; l < k; l++ {
+				tmp1 := alpha * b[l*ldb+i]
+				tmp2 := alpha * a[l*lda+i]
+				btmp := b[l*ldb+i : l*ldb+n]
+				// Skip the update when both scalars vanish.
+				if tmp1 != 0 || tmp2 != 0 {
+					for j, v := range a[l*lda+i : l*lda+n] {
+						ctmp[j] += v*tmp1 + btmp[j]*tmp2
+					}
+				}
+			}
+		}
+		return
+	}
+	for i := 0; i < n; i++ {
+		ctmp := c[i*ldc : i*ldc+i+1]
+		switch beta {
+		case 0:
+			for j := range ctmp {
+				ctmp[j] = 0
+			}
+		case 1:
+			// C is used as is.
+		default:
+			for j := range ctmp {
+				ctmp[j] *= beta
+			}
+		}
+		for l := 0; l < k; l++ {
+			tmp1 := alpha * b[l*ldb+i]
+			tmp2 := alpha * a[l*lda+i]
+			btmp := b[l*ldb : l*ldb+i+1]
+			// Skip the update when both scalars vanish.
+			if tmp1 != 0 || tmp2 != 0 {
+				for j, v := range a[l*lda : l*lda+i+1] {
+					ctmp[j] += v*tmp1 + btmp[j]*tmp2
+				}
+			}
+		}
+	}
+}
+
+// Strmm performs one of the matrix-matrix operations
+//
+//	B = alpha * A * B   if tA == blas.NoTrans and side == blas.Left
+//	B = alpha * Aᵀ * B  if tA == blas.Trans or blas.ConjTrans, and side == blas.Left
+//	B = alpha * B * A   if tA == blas.NoTrans and side == blas.Right
+//	B = alpha * B * Aᵀ  if tA == blas.Trans or blas.ConjTrans, and side == blas.Right
+//
+// where A is an n×n or m×m triangular matrix, B is an m×n matrix, and alpha is a scalar.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Strmm(s blas.Side, ul blas.Uplo, tA blas.Transpose, d blas.Diag, m, n int, alpha float32, a []float32, lda int, b []float32, ldb int) {
+	// Validate the enumerated arguments and the dimensions before any data
+	// is touched.
+	if s != blas.Left && s != blas.Right {
+		panic(badSide)
+	}
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if d != blas.NonUnit && d != blas.Unit {
+		panic(badDiag)
+	}
+	if m < 0 {
+		panic(mLT0)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	// k is the order of the triangular matrix A: m when A is applied from
+	// the left, n when it is applied from the right.
+	k := n
+	if s == blas.Left {
+		k = m
+	}
+	if lda < max(1, k) {
+		panic(badLdA)
+	}
+	if ldb < max(1, n) {
+		panic(badLdB)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(k-1)+k {
+		panic(shortA)
+	}
+	if len(b) < ldb*(m-1)+n {
+		panic(shortB)
+	}
+
+	// With alpha == 0 the result is zero; B is zeroed by assignment and A
+	// is never read.
+	if alpha == 0 {
+		for i := 0; i < m; i++ {
+			btmp := b[i*ldb : i*ldb+n]
+			for j := range btmp {
+				btmp[j] = 0
+			}
+		}
+		return
+	}
+
+	// With a unit diagonal the diagonal entries of A are never read.
+	nonUnit := d == blas.NonUnit
+	if s == blas.Left {
+		if tA == blas.NoTrans {
+			if ul == blas.Upper {
+				// Rows are processed top-down: row i only needs the rows
+				// k > i of B, which still hold their input values.
+				for i := 0; i < m; i++ {
+					tmp := alpha
+					if nonUnit {
+						tmp *= a[i*lda+i]
+					}
+					btmp := b[i*ldb : i*ldb+n]
+					f32.ScalUnitary(tmp, btmp)
+					for ka, va := range a[i*lda+i+1 : i*lda+m] {
+						// ka indexes the sub-slice; k is the row of B.
+						k := ka + i + 1
+						if va != 0 {
+							f32.AxpyUnitary(alpha*va, b[k*ldb:k*ldb+n], btmp)
+						}
+					}
+				}
+				return
+			}
+			// Lower triangular: rows are processed bottom-up, so the rows
+			// k < i read here still hold their input values.
+			for i := m - 1; i >= 0; i-- {
+				tmp := alpha
+				if nonUnit {
+					tmp *= a[i*lda+i]
+				}
+				btmp := b[i*ldb : i*ldb+n]
+				f32.ScalUnitary(tmp, btmp)
+				for k, va := range a[i*lda : i*lda+i] {
+					if va != 0 {
+						f32.AxpyUnitary(alpha*va, b[k*ldb:k*ldb+n], btmp)
+					}
+				}
+			}
+			return
+		}
+		// Cases where a is transposed.
+		if ul == blas.Upper {
+			// Row k of B, still holding its input values, is first added
+			// into the later rows i > k and only then scaled in place.
+			for k := m - 1; k >= 0; k-- {
+				btmpk := b[k*ldb : k*ldb+n]
+				for ia, va := range a[k*lda+k+1 : k*lda+m] {
+					// ia indexes the sub-slice; i is the row of B.
+					i := ia + k + 1
+					btmp := b[i*ldb : i*ldb+n]
+					if va != 0 {
+						f32.AxpyUnitary(alpha*va, btmpk, btmp)
+					}
+				}
+				tmp := alpha
+				if nonUnit {
+					tmp *= a[k*lda+k]
+				}
+				if tmp != 1 {
+					f32.ScalUnitary(tmp, btmpk)
+				}
+			}
+			return
+		}
+		// Lower triangular, transposed: the same scheme run top-down,
+		// adding row k into the earlier rows i < k.
+		for k := 0; k < m; k++ {
+			btmpk := b[k*ldb : k*ldb+n]
+			for i, va := range a[k*lda : k*lda+k] {
+				btmp := b[i*ldb : i*ldb+n]
+				if va != 0 {
+					f32.AxpyUnitary(alpha*va, btmpk, btmp)
+				}
+			}
+			tmp := alpha
+			if nonUnit {
+				tmp *= a[k*lda+k]
+			}
+			if tmp != 1 {
+				f32.ScalUnitary(tmp, btmpk)
+			}
+		}
+		return
+	}
+	// Cases where a is on the right
+	// Every row of B is transformed independently of the others.
+	if tA == blas.NoTrans {
+		if ul == blas.Upper {
+			for i := 0; i < m; i++ {
+				btmp := b[i*ldb : i*ldb+n]
+				// Entries are processed right to left: entry k is spread
+				// onto the entries to its right, which are already final.
+				for k := n - 1; k >= 0; k-- {
+					tmp := alpha * btmp[k]
+					if tmp == 0 {
+						continue
+					}
+					btmp[k] = tmp
+					if nonUnit {
+						btmp[k] *= a[k*lda+k]
+					}
+					f32.AxpyUnitary(tmp, a[k*lda+k+1:k*lda+n], btmp[k+1:n])
+				}
+			}
+			return
+		}
+		for i := 0; i < m; i++ {
+			btmp := b[i*ldb : i*ldb+n]
+			// Entries are processed left to right: entry k is spread onto
+			// the entries to its left, which are already final.
+			for k := 0; k < n; k++ {
+				tmp := alpha * btmp[k]
+				if tmp == 0 {
+					continue
+				}
+				btmp[k] = tmp
+				if nonUnit {
+					btmp[k] *= a[k*lda+k]
+				}
+				f32.AxpyUnitary(tmp, a[k*lda:k*lda+k], btmp[:k])
+			}
+		}
+		return
+	}
+	// Cases where a is transposed.
+	if ul == blas.Upper {
+		for i := 0; i < m; i++ {
+			btmp := b[i*ldb : i*ldb+n]
+			// Entry j combines itself with the entries to its right, which
+			// still hold their input values because j runs upwards.
+			for j, vb := range btmp {
+				tmp := vb
+				if nonUnit {
+					tmp *= a[j*lda+j]
+				}
+				tmp += f32.DotUnitary(a[j*lda+j+1:j*lda+n], btmp[j+1:n])
+				btmp[j] = alpha * tmp
+			}
+		}
+		return
+	}
+	for i := 0; i < m; i++ {
+		btmp := b[i*ldb : i*ldb+n]
+		// Entry j combines itself with the entries to its left, which still
+		// hold their input values because j runs downwards.
+		for j := n - 1; j >= 0; j-- {
+			tmp := btmp[j]
+			if nonUnit {
+				tmp *= a[j*lda+j]
+			}
+			tmp += f32.DotUnitary(a[j*lda:j*lda+j], btmp[:j])
+			btmp[j] = alpha * tmp
+		}
+	}
+}
--- a/vendor/gonum.org/v1/gonum/blas/gonum/level3float64.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/level3float64.go
@@ -0,0 +1,913 @@
+// Copyright ©2014 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/internal/asm/f64"
+)
+
+var _ blas.Float64Level3 = Implementation{}
+
+// Dtrsm solves one of the matrix equations
+//
+//	A * X = alpha * B   if tA == blas.NoTrans and side == blas.Left
+//	Aᵀ * X = alpha * B  if tA == blas.Trans or blas.ConjTrans, and side == blas.Left
+//	X * A = alpha * B   if tA == blas.NoTrans and side == blas.Right
+//	X * Aᵀ = alpha * B  if tA == blas.Trans or blas.ConjTrans, and side == blas.Right
+//
+// where A is an n×n or m×m triangular matrix, X and B are m×n matrices, and alpha is a
+// scalar.
+//
+// At entry to the function, X contains the values of B, and the result is
+// stored in-place into X.
+//
+// No check is made that A is invertible.
+func (Implementation) Dtrsm(s blas.Side, ul blas.Uplo, tA blas.Transpose, d blas.Diag, m, n int, alpha float64, a []float64, lda int, b []float64, ldb int) {
+	// Validate the flag arguments, the dimensions and the leading dimensions.
+	// Slice lengths are checked further down, after the empty-matrix return.
+	if s != blas.Left && s != blas.Right {
+		panic(badSide)
+	}
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if d != blas.NonUnit && d != blas.Unit {
+		panic(badDiag)
+	}
+	if m < 0 {
+		panic(mLT0)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	// k is the order of A: m when A is on the left, n when it is on the right.
+	k := n
+	if s == blas.Left {
+		k = m
+	}
+	if lda < max(1, k) {
+		panic(badLdA)
+	}
+	if ldb < max(1, n) {
+		panic(badLdB)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(k-1)+k {
+		panic(shortA)
+	}
+	if len(b) < ldb*(m-1)+n {
+		panic(shortB)
+	}
+
+	// With alpha == 0 every element of X is set to zero and A is not read.
+	if alpha == 0 {
+		for i := 0; i < m; i++ {
+			btmp := b[i*ldb : i*ldb+n]
+			for j := range btmp {
+				btmp[j] = 0
+			}
+		}
+		return
+	}
+	nonUnit := d == blas.NonUnit
+	if s == blas.Left {
+		if tA == blas.NoTrans {
+			if ul == blas.Upper {
+				// Work from the last row upwards: row i is scaled by alpha,
+				// reduced using the already solved rows below it, and then
+				// divided by the diagonal entry unless A is unit triangular.
+				for i := m - 1; i >= 0; i-- {
+					btmp := b[i*ldb : i*ldb+n]
+					if alpha != 1 {
+						f64.ScalUnitary(alpha, btmp)
+					}
+					for ka, va := range a[i*lda+i+1 : i*lda+m] {
+						if va != 0 {
+							k := ka + i + 1
+							f64.AxpyUnitary(-va, b[k*ldb:k*ldb+n], btmp)
+						}
+					}
+					if nonUnit {
+						tmp := 1 / a[i*lda+i]
+						f64.ScalUnitary(tmp, btmp)
+					}
+				}
+				return
+			}
+			// Lower triangular: work from the first row downwards, reducing
+			// row i using the already solved rows above it.
+			for i := 0; i < m; i++ {
+				btmp := b[i*ldb : i*ldb+n]
+				if alpha != 1 {
+					f64.ScalUnitary(alpha, btmp)
+				}
+				for k, va := range a[i*lda : i*lda+i] {
+					if va != 0 {
+						f64.AxpyUnitary(-va, b[k*ldb:k*ldb+n], btmp)
+					}
+				}
+				if nonUnit {
+					tmp := 1 / a[i*lda+i]
+					f64.ScalUnitary(tmp, btmp)
+				}
+			}
+			return
+		}
+		// Cases where a is transposed
+		if ul == blas.Upper {
+			// Row k is divided by the diagonal entry and then subtracted from
+			// the rows after it. alpha is applied to row k only after those
+			// updates; the later rows are scaled when their own turn comes.
+			for k := 0; k < m; k++ {
+				btmpk := b[k*ldb : k*ldb+n]
+				if nonUnit {
+					tmp := 1 / a[k*lda+k]
+					f64.ScalUnitary(tmp, btmpk)
+				}
+				for ia, va := range a[k*lda+k+1 : k*lda+m] {
+					if va != 0 {
+						i := ia + k + 1
+						f64.AxpyUnitary(-va, btmpk, b[i*ldb:i*ldb+n])
+					}
+				}
+				if alpha != 1 {
+					f64.ScalUnitary(alpha, btmpk)
+				}
+			}
+			return
+		}
+		// Lower triangular: same scheme, sweeping rows from last to first and
+		// updating the rows before row k.
+		for k := m - 1; k >= 0; k-- {
+			btmpk := b[k*ldb : k*ldb+n]
+			if nonUnit {
+				tmp := 1 / a[k*lda+k]
+				f64.ScalUnitary(tmp, btmpk)
+			}
+			for i, va := range a[k*lda : k*lda+k] {
+				if va != 0 {
+					f64.AxpyUnitary(-va, btmpk, b[i*ldb:i*ldb+n])
+				}
+			}
+			if alpha != 1 {
+				f64.ScalUnitary(alpha, btmpk)
+			}
+		}
+		return
+	}
+	// Cases where a is to the right of X.
+	if tA == blas.NoTrans {
+		if ul == blas.Upper {
+			// Each row of X is handled on its own, sweeping columns left to
+			// right and eliminating column k from the columns after it.
+			for i := 0; i < m; i++ {
+				btmp := b[i*ldb : i*ldb+n]
+				if alpha != 1 {
+					f64.ScalUnitary(alpha, btmp)
+				}
+				for k, vb := range btmp {
+					// A zero entry contributes nothing to the later columns.
+					if vb == 0 {
+						continue
+					}
+					if nonUnit {
+						btmp[k] /= a[k*lda+k]
+					}
+					f64.AxpyUnitary(-btmp[k], a[k*lda+k+1:k*lda+n], btmp[k+1:n])
+				}
+			}
+			return
+		}
+		// Lower triangular: columns are swept right to left and column k is
+		// eliminated from the columns before it.
+		for i := 0; i < m; i++ {
+			btmp := b[i*ldb : i*ldb+n]
+			if alpha != 1 {
+				f64.ScalUnitary(alpha, btmp)
+			}
+			for k := n - 1; k >= 0; k-- {
+				if btmp[k] == 0 {
+					continue
+				}
+				if nonUnit {
+					btmp[k] /= a[k*lda+k]
+				}
+				f64.AxpyUnitary(-btmp[k], a[k*lda:k*lda+k], btmp[:k])
+			}
+		}
+		return
+	}
+	// Cases where a is transposed.
+	if ul == blas.Upper {
+		// Entry j is computed from the already solved entries to its right,
+		// so the columns are visited from last to first.
+		for i := 0; i < m; i++ {
+			btmp := b[i*ldb : i*ldb+n]
+			for j := n - 1; j >= 0; j-- {
+				tmp := alpha*btmp[j] - f64.DotUnitary(a[j*lda+j+1:j*lda+n], btmp[j+1:])
+				if nonUnit {
+					tmp /= a[j*lda+j]
+				}
+				btmp[j] = tmp
+			}
+		}
+		return
+	}
+	// Lower triangular: entry j depends on the already solved entries to its
+	// left, so the columns are visited from first to last.
+	for i := 0; i < m; i++ {
+		btmp := b[i*ldb : i*ldb+n]
+		for j := 0; j < n; j++ {
+			tmp := alpha*btmp[j] - f64.DotUnitary(a[j*lda:j*lda+j], btmp[:j])
+			if nonUnit {
+				tmp /= a[j*lda+j]
+			}
+			btmp[j] = tmp
+		}
+	}
+}
+
+// Dsymm performs one of the matrix-matrix operations
+//
+//	C = alpha * A * B + beta * C  if side == blas.Left
+//	C = alpha * B * A + beta * C  if side == blas.Right
+//
+// where A is an n×n or m×m symmetric matrix, B and C are m×n matrices, and alpha
+// is a scalar.
+func (Implementation) Dsymm(s blas.Side, ul blas.Uplo, m, n int, alpha float64, a []float64, lda int, b []float64, ldb int, beta float64, c []float64, ldc int) {
+	// Argument validation. The cases are evaluated top to bottom, which
+	// determines the panic raised when several arguments are invalid.
+	switch {
+	case s != blas.Right && s != blas.Left:
+		panic(badSide)
+	case ul != blas.Lower && ul != blas.Upper:
+		panic(badUplo)
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	}
+	// order is the dimension of the square matrix A.
+	order := n
+	if s == blas.Left {
+		order = m
+	}
+	switch {
+	case lda < max(1, order):
+		panic(badLdA)
+	case ldb < max(1, n):
+		panic(badLdB)
+	case ldc < max(1, n):
+		panic(badLdC)
+	}
+
+	// An empty C leaves nothing to compute.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// The matrices are non-empty from here on, so the length formulas apply.
+	switch {
+	case len(a) < lda*(order-1)+order:
+		panic(shortA)
+	case len(b) < ldb*(m-1)+n:
+		panic(shortB)
+	case len(c) < ldc*(m-1)+n:
+		panic(shortC)
+	}
+
+	// C = 0*A*B + 1*C leaves C untouched.
+	if alpha == 0 && beta == 1 {
+		return
+	}
+
+	// Clear C explicitly when beta is zero.
+	if beta == 0 {
+		for row := 0; row < m; row++ {
+			crow := c[row*ldc : row*ldc+n]
+			for col := range crow {
+				crow[col] = 0
+			}
+		}
+	}
+
+	// Without the product term only the scaling of C remains.
+	if alpha == 0 {
+		if beta != 0 {
+			for row := 0; row < m; row++ {
+				crow := c[row*ldc : row*ldc+n]
+				for col := 0; col < n; col++ {
+					crow[col] *= beta
+				}
+			}
+		}
+		return
+	}
+
+	upper := ul == blas.Upper
+	if s == blas.Left {
+		for i := 0; i < m; i++ {
+			diag := alpha * a[i*lda+i]
+			brow := b[i*ldb : i*ldb+n]
+			crow := c[i*ldc : i*ldc+n]
+			// Scale row i of C and add the diagonal contribution.
+			for j, bv := range brow {
+				crow[j] *= beta
+				crow[j] += diag * bv
+			}
+
+			// Add the off-diagonal contributions in increasing k, reading
+			// the symmetric element from whichever triangle is stored.
+			for k := 0; k < m; k++ {
+				if k == i {
+					continue
+				}
+				lo, hi := k, i
+				if k > i {
+					lo, hi = i, k
+				}
+				var sym float64
+				if upper {
+					sym = a[lo*lda+hi]
+				} else {
+					sym = a[hi*lda+lo]
+				}
+				sym *= alpha
+				f64.AxpyUnitary(sym, b[k*ldb:k*ldb+n], crow)
+			}
+		}
+		return
+	}
+	if upper {
+		// A on the right, upper triangle stored: columns are visited from
+		// last to first so that the entries updated through ctail have
+		// already been scaled by beta.
+		for i := 0; i < m; i++ {
+			for j := n - 1; j >= 0; j-- {
+				scaled := alpha * b[i*ldb+j]
+				var dot float64
+				arow := a[j*lda+j+1 : j*lda+n]
+				btail := b[i*ldb+j+1 : i*ldb+n]
+				ctail := c[i*ldc+j+1 : i*ldc+n]
+				for k, av := range arow {
+					ctail[k] += scaled * av
+					dot += btail[k] * av
+				}
+				c[i*ldc+j] *= beta
+				c[i*ldc+j] += scaled*a[j*lda+j] + alpha*dot
+			}
+		}
+		return
+	}
+	// A on the right, lower triangle stored: columns are visited from first
+	// to last for the same reason.
+	for i := 0; i < m; i++ {
+		for j := 0; j < n; j++ {
+			scaled := alpha * b[i*ldb+j]
+			var dot float64
+			arow := a[j*lda : j*lda+j]
+			bhead := b[i*ldb : i*ldb+j]
+			chead := c[i*ldc : i*ldc+j]
+			for k, av := range arow {
+				chead[k] += scaled * av
+				dot += bhead[k] * av
+			}
+			c[i*ldc+j] *= beta
+			c[i*ldc+j] += scaled*a[j*lda+j] + alpha*dot
+		}
+	}
+}
+
+// Dsyrk performs one of the symmetric rank-k operations
+//
+//	C = alpha * A * Aᵀ + beta * C  if tA == blas.NoTrans
+//	C = alpha * Aᵀ * A + beta * C  if tA == blas.Trans or tA == blas.ConjTrans
+//
+// where A is an n×k or k×n matrix, C is an n×n symmetric matrix, and alpha and
+// beta are scalars.
+//
+// Only the triangle of C selected by ul is read and written. When beta is
+// zero that triangle is overwritten without its previous contents being used.
+func (Implementation) Dsyrk(ul blas.Uplo, tA blas.Transpose, n, k int, alpha float64, a []float64, lda int, beta float64, c []float64, ldc int) {
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if tA != blas.Trans && tA != blas.NoTrans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if k < 0 {
+		panic(kLT0)
+	}
+	// row and col are the dimensions of A as stored: n×k when A is not
+	// transposed and k×n otherwise.
+	row, col := k, n
+	if tA == blas.NoTrans {
+		row, col = n, k
+	}
+	if lda < max(1, col) {
+		panic(badLdA)
+	}
+	if ldc < max(1, n) {
+		panic(badLdC)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(row-1)+col {
+		panic(shortA)
+	}
+	if len(c) < ldc*(n-1)+n {
+		panic(shortC)
+	}
+
+	// Without the product term only the selected triangle of C is cleared
+	// (beta == 0) or scaled by beta.
+	if alpha == 0 {
+		if beta == 0 {
+			if ul == blas.Upper {
+				for i := 0; i < n; i++ {
+					ctmp := c[i*ldc+i : i*ldc+n]
+					for j := range ctmp {
+						ctmp[j] = 0
+					}
+				}
+				return
+			}
+			for i := 0; i < n; i++ {
+				ctmp := c[i*ldc : i*ldc+i+1]
+				for j := range ctmp {
+					ctmp[j] = 0
+				}
+			}
+			return
+		}
+		if ul == blas.Upper {
+			for i := 0; i < n; i++ {
+				ctmp := c[i*ldc+i : i*ldc+n]
+				for j := range ctmp {
+					ctmp[j] *= beta
+				}
+			}
+			return
+		}
+		for i := 0; i < n; i++ {
+			ctmp := c[i*ldc : i*ldc+i+1]
+			for j := range ctmp {
+				ctmp[j] *= beta
+			}
+		}
+		return
+	}
+	if tA == blas.NoTrans {
+		// Each referenced element of C is a dot product of two rows of A.
+		// With beta == 0 the element is assigned rather than updated.
+		if ul == blas.Upper {
+			for i := 0; i < n; i++ {
+				ctmp := c[i*ldc+i : i*ldc+n]
+				atmp := a[i*lda : i*lda+k]
+				if beta == 0 {
+					for jc := range ctmp {
+						j := jc + i
+						ctmp[jc] = alpha * f64.DotUnitary(atmp, a[j*lda:j*lda+k])
+					}
+				} else {
+					for jc, vc := range ctmp {
+						j := jc + i
+						ctmp[jc] = vc*beta + alpha*f64.DotUnitary(atmp, a[j*lda:j*lda+k])
+					}
+				}
+			}
+			return
+		}
+		for i := 0; i < n; i++ {
+			ctmp := c[i*ldc : i*ldc+i+1]
+			atmp := a[i*lda : i*lda+k]
+			if beta == 0 {
+				for j := range ctmp {
+					ctmp[j] = alpha * f64.DotUnitary(a[j*lda:j*lda+k], atmp)
+				}
+			} else {
+				for j, vc := range ctmp {
+					ctmp[j] = vc*beta + alpha*f64.DotUnitary(a[j*lda:j*lda+k], atmp)
+				}
+			}
+		}
+		return
+	}
+	// Cases where a is transposed: the triangle row is first cleared or
+	// scaled, then one scaled row segment of A is added per l.
+	if ul == blas.Upper {
+		for i := 0; i < n; i++ {
+			ctmp := c[i*ldc+i : i*ldc+n]
+			if beta == 0 {
+				for j := range ctmp {
+					ctmp[j] = 0
+				}
+			} else if beta != 1 {
+				for j := range ctmp {
+					ctmp[j] *= beta
+				}
+			}
+			for l := 0; l < k; l++ {
+				tmp := alpha * a[l*lda+i]
+				if tmp != 0 {
+					f64.AxpyUnitary(tmp, a[l*lda+i:l*lda+n], ctmp)
+				}
+			}
+		}
+		return
+	}
+	for i := 0; i < n; i++ {
+		ctmp := c[i*ldc : i*ldc+i+1]
+		// Clear explicitly when beta == 0, as the upper-triangle branch does:
+		// multiplying by zero would let NaN or Inf already stored in C leak
+		// into the result.
+		if beta == 0 {
+			for j := range ctmp {
+				ctmp[j] = 0
+			}
+		} else if beta != 1 {
+			for j := range ctmp {
+				ctmp[j] *= beta
+			}
+		}
+		for l := 0; l < k; l++ {
+			tmp := alpha * a[l*lda+i]
+			if tmp != 0 {
+				f64.AxpyUnitary(tmp, a[l*lda:l*lda+i+1], ctmp)
+			}
+		}
+	}
+}
+
+// Dsyr2k performs one of the symmetric rank 2k operations
+//
+//	C = alpha * A * Bᵀ + alpha * B * Aᵀ + beta * C  if tA == blas.NoTrans
+//	C = alpha * Aᵀ * B + alpha * Bᵀ * A + beta * C  if tA == blas.Trans or tA == blas.ConjTrans
+//
+// where A and B are n×k or k×n matrices, C is an n×n symmetric matrix, and
+// alpha and beta are scalars.
+func (Implementation) Dsyr2k(ul blas.Uplo, tA blas.Transpose, n, k int, alpha float64, a []float64, lda int, b []float64, ldb int, beta float64, c []float64, ldc int) {
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if tA != blas.Trans && tA != blas.NoTrans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	if k < 0 {
+		panic(kLT0)
+	}
+	// row and col are the dimensions of A and B as stored: n×k when they are
+	// not transposed and k×n otherwise.
+	row, col := k, n
+	if tA == blas.NoTrans {
+		row, col = n, k
+	}
+	if lda < max(1, col) {
+		panic(badLdA)
+	}
+	if ldb < max(1, col) {
+		panic(badLdB)
+	}
+	if ldc < max(1, n) {
+		panic(badLdC)
+	}
+
+	// Quick return if possible.
+	if n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(row-1)+col {
+		panic(shortA)
+	}
+	if len(b) < ldb*(row-1)+col {
+		panic(shortB)
+	}
+	if len(c) < ldc*(n-1)+n {
+		panic(shortC)
+	}
+
+	// Without the product terms only the selected triangle of C is cleared
+	// (beta == 0) or scaled by beta.
+	if alpha == 0 {
+		if beta == 0 {
+			if ul == blas.Upper {
+				for i := 0; i < n; i++ {
+					ctmp := c[i*ldc+i : i*ldc+n]
+					for j := range ctmp {
+						ctmp[j] = 0
+					}
+				}
+				return
+			}
+			for i := 0; i < n; i++ {
+				ctmp := c[i*ldc : i*ldc+i+1]
+				for j := range ctmp {
+					ctmp[j] = 0
+				}
+			}
+			return
+		}
+		if ul == blas.Upper {
+			for i := 0; i < n; i++ {
+				ctmp := c[i*ldc+i : i*ldc+n]
+				for j := range ctmp {
+					ctmp[j] *= beta
+				}
+			}
+			return
+		}
+		for i := 0; i < n; i++ {
+			ctmp := c[i*ldc : i*ldc+i+1]
+			for j := range ctmp {
+				ctmp[j] *= beta
+			}
+		}
+		return
+	}
+	if tA == blas.NoTrans {
+		// For each referenced element (i, j), tmp1 accumulates the dot product
+		// of row j of A with row i of B and tmp2 the dot product of row i of A
+		// with row j of B. With beta == 0 the element is assigned rather than
+		// updated.
+		if ul == blas.Upper {
+			for i := 0; i < n; i++ {
+				atmp := a[i*lda : i*lda+k]
+				btmp := b[i*ldb : i*ldb+k]
+				ctmp := c[i*ldc+i : i*ldc+n]
+				if beta == 0 {
+					for jc := range ctmp {
+						j := i + jc
+						var tmp1, tmp2 float64
+						binner := b[j*ldb : j*ldb+k]
+						for l, v := range a[j*lda : j*lda+k] {
+							tmp1 += v * btmp[l]
+							tmp2 += atmp[l] * binner[l]
+						}
+						ctmp[jc] = alpha * (tmp1 + tmp2)
+					}
+				} else {
+					for jc := range ctmp {
+						j := i + jc
+						var tmp1, tmp2 float64
+						binner := b[j*ldb : j*ldb+k]
+						for l, v := range a[j*lda : j*lda+k] {
+							tmp1 += v * btmp[l]
+							tmp2 += atmp[l] * binner[l]
+						}
+						ctmp[jc] *= beta
+						ctmp[jc] += alpha * (tmp1 + tmp2)
+					}
+				}
+			}
+			return
+		}
+		// Lower triangle: the same computation over columns 0..i of row i.
+		for i := 0; i < n; i++ {
+			atmp := a[i*lda : i*lda+k]
+			btmp := b[i*ldb : i*ldb+k]
+			ctmp := c[i*ldc : i*ldc+i+1]
+			if beta == 0 {
+				for j := 0; j <= i; j++ {
+					var tmp1, tmp2 float64
+					binner := b[j*ldb : j*ldb+k]
+					for l, v := range a[j*lda : j*lda+k] {
+						tmp1 += v * btmp[l]
+						tmp2 += atmp[l] * binner[l]
+					}
+					ctmp[j] = alpha * (tmp1 + tmp2)
+				}
+			} else {
+				for j := 0; j <= i; j++ {
+					var tmp1, tmp2 float64
+					binner := b[j*ldb : j*ldb+k]
+					for l, v := range a[j*lda : j*lda+k] {
+						tmp1 += v * btmp[l]
+						tmp2 += atmp[l] * binner[l]
+					}
+					ctmp[j] *= beta
+					ctmp[j] += alpha * (tmp1 + tmp2)
+				}
+			}
+		}
+		return
+	}
+	// Cases where a and b are transposed: the triangle row of C is first
+	// cleared or scaled according to beta, then updated once per row l of A
+	// and B.
+	if ul == blas.Upper {
+		for i := 0; i < n; i++ {
+			ctmp := c[i*ldc+i : i*ldc+n]
+			switch beta {
+			case 0:
+				for j := range ctmp {
+					ctmp[j] = 0
+				}
+			case 1:
+			default:
+				for j := range ctmp {
+					ctmp[j] *= beta
+				}
+			}
+			for l := 0; l < k; l++ {
+				tmp1 := alpha * b[l*ldb+i]
+				tmp2 := alpha * a[l*lda+i]
+				btmp := b[l*ldb+i : l*ldb+n]
+				// The update is skipped only when both scaled factors are zero.
+				if tmp1 != 0 || tmp2 != 0 {
+					for j, v := range a[l*lda+i : l*lda+n] {
+						ctmp[j] += v*tmp1 + btmp[j]*tmp2
+					}
+				}
+			}
+		}
+		return
+	}
+	for i := 0; i < n; i++ {
+		ctmp := c[i*ldc : i*ldc+i+1]
+		switch beta {
+		case 0:
+			for j := range ctmp {
+				ctmp[j] = 0
+			}
+		case 1:
+		default:
+			for j := range ctmp {
+				ctmp[j] *= beta
+			}
+		}
+		for l := 0; l < k; l++ {
+			tmp1 := alpha * b[l*ldb+i]
+			tmp2 := alpha * a[l*lda+i]
+			btmp := b[l*ldb : l*ldb+i+1]
+			if tmp1 != 0 || tmp2 != 0 {
+				for j, v := range a[l*lda : l*lda+i+1] {
+					ctmp[j] += v*tmp1 + btmp[j]*tmp2
+				}
+			}
+		}
+	}
+}
+
+// Dtrmm performs one of the matrix-matrix operations
+//
+//	B = alpha * A * B   if tA == blas.NoTrans and side == blas.Left
+//	B = alpha * Aᵀ * B  if tA == blas.Trans or blas.ConjTrans, and side == blas.Left
+//	B = alpha * B * A   if tA == blas.NoTrans and side == blas.Right
+//	B = alpha * B * Aᵀ  if tA == blas.Trans or blas.ConjTrans, and side == blas.Right
+//
+// where A is an n×n or m×m triangular matrix, B is an m×n matrix, and alpha is a scalar.
+func (Implementation) Dtrmm(s blas.Side, ul blas.Uplo, tA blas.Transpose, d blas.Diag, m, n int, alpha float64, a []float64, lda int, b []float64, ldb int) {
+	// Validate the flag arguments, the dimensions and the leading dimensions.
+	// Slice lengths are checked further down, after the empty-matrix return.
+	if s != blas.Left && s != blas.Right {
+		panic(badSide)
+	}
+	if ul != blas.Lower && ul != blas.Upper {
+		panic(badUplo)
+	}
+	if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if d != blas.NonUnit && d != blas.Unit {
+		panic(badDiag)
+	}
+	if m < 0 {
+		panic(mLT0)
+	}
+	if n < 0 {
+		panic(nLT0)
+	}
+	// k is the order of A: m when A is on the left, n when it is on the right.
+	k := n
+	if s == blas.Left {
+		k = m
+	}
+	if lda < max(1, k) {
+		panic(badLdA)
+	}
+	if ldb < max(1, n) {
+		panic(badLdB)
+	}
+
+	// Quick return if possible.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// For zero matrix size the following slice length checks are trivially satisfied.
+	if len(a) < lda*(k-1)+k {
+		panic(shortA)
+	}
+	if len(b) < ldb*(m-1)+n {
+		panic(shortB)
+	}
+
+	// With alpha == 0 every element of B is set to zero and A is not read.
+	if alpha == 0 {
+		for i := 0; i < m; i++ {
+			btmp := b[i*ldb : i*ldb+n]
+			for j := range btmp {
+				btmp[j] = 0
+			}
+		}
+		return
+	}
+
+	nonUnit := d == blas.NonUnit
+	if s == blas.Left {
+		if tA == blas.NoTrans {
+			if ul == blas.Upper {
+				// Rows are processed top to bottom, so the rows below row i
+				// that it reads have not been overwritten yet.
+				for i := 0; i < m; i++ {
+					tmp := alpha
+					if nonUnit {
+						tmp *= a[i*lda+i]
+					}
+					btmp := b[i*ldb : i*ldb+n]
+					f64.ScalUnitary(tmp, btmp)
+					for ka, va := range a[i*lda+i+1 : i*lda+m] {
+						k := ka + i + 1
+						if va != 0 {
+							f64.AxpyUnitary(alpha*va, b[k*ldb:k*ldb+n], btmp)
+						}
+					}
+				}
+				return
+			}
+			// Lower triangular: rows are processed bottom to top, so the rows
+			// above row i that it reads have not been overwritten yet.
+			for i := m - 1; i >= 0; i-- {
+				tmp := alpha
+				if nonUnit {
+					tmp *= a[i*lda+i]
+				}
+				btmp := b[i*ldb : i*ldb+n]
+				f64.ScalUnitary(tmp, btmp)
+				for k, va := range a[i*lda : i*lda+i] {
+					if va != 0 {
+						f64.AxpyUnitary(alpha*va, b[k*ldb:k*ldb+n], btmp)
+					}
+				}
+			}
+			return
+		}
+		// Cases where a is transposed.
+		if ul == blas.Upper {
+			// Row k is added, scaled, into the rows after it while it still
+			// holds its original value; only then is row k itself scaled.
+			for k := m - 1; k >= 0; k-- {
+				btmpk := b[k*ldb : k*ldb+n]
+				for ia, va := range a[k*lda+k+1 : k*lda+m] {
+					i := ia + k + 1
+					btmp := b[i*ldb : i*ldb+n]
+					if va != 0 {
+						f64.AxpyUnitary(alpha*va, btmpk, btmp)
+					}
+				}
+				tmp := alpha
+				if nonUnit {
+					tmp *= a[k*lda+k]
+				}
+				if tmp != 1 {
+					f64.ScalUnitary(tmp, btmpk)
+				}
+			}
+			return
+		}
+		// Lower triangular: same scheme, sweeping rows first to last and
+		// adding row k into the rows before it.
+		for k := 0; k < m; k++ {
+			btmpk := b[k*ldb : k*ldb+n]
+			for i, va := range a[k*lda : k*lda+k] {
+				btmp := b[i*ldb : i*ldb+n]
+				if va != 0 {
+					f64.AxpyUnitary(alpha*va, btmpk, btmp)
+				}
+			}
+			tmp := alpha
+			if nonUnit {
+				tmp *= a[k*lda+k]
+			}
+			if tmp != 1 {
+				f64.ScalUnitary(tmp, btmpk)
+			}
+		}
+		return
+	}
+	// Cases where a is on the right
+	if tA == blas.NoTrans {
+		if ul == blas.Upper {
+			// Each row of B is handled on its own. Columns are visited from
+			// last to first; entry k is scaled in place and its contribution
+			// is added to the already processed entries after it.
+			for i := 0; i < m; i++ {
+				btmp := b[i*ldb : i*ldb+n]
+				for k := n - 1; k >= 0; k-- {
+					tmp := alpha * btmp[k]
+					// A zero product leaves the entry as is and adds nothing.
+					if tmp == 0 {
+						continue
+					}
+					btmp[k] = tmp
+					if nonUnit {
+						btmp[k] *= a[k*lda+k]
+					}
+					f64.AxpyUnitary(tmp, a[k*lda+k+1:k*lda+n], btmp[k+1:n])
+				}
+			}
+			return
+		}
+		// Lower triangular: columns are visited from first to last and the
+		// contribution of entry k is added to the entries before it.
+		for i := 0; i < m; i++ {
+			btmp := b[i*ldb : i*ldb+n]
+			for k := 0; k < n; k++ {
+				tmp := alpha * btmp[k]
+				if tmp == 0 {
+					continue
+				}
+				btmp[k] = tmp
+				if nonUnit {
+					btmp[k] *= a[k*lda+k]
+				}
+				f64.AxpyUnitary(tmp, a[k*lda:k*lda+k], btmp[:k])
+			}
+		}
+		return
+	}
+	// Cases where a is transposed.
+	if ul == blas.Upper {
+		// Entry j is computed from itself and the not yet overwritten entries
+		// to its right, so the columns are visited from first to last.
+		for i := 0; i < m; i++ {
+			btmp := b[i*ldb : i*ldb+n]
+			for j, vb := range btmp {
+				tmp := vb
+				if nonUnit {
+					tmp *= a[j*lda+j]
+				}
+				tmp += f64.DotUnitary(a[j*lda+j+1:j*lda+n], btmp[j+1:n])
+				btmp[j] = alpha * tmp
+			}
+		}
+		return
+	}
+	// Lower triangular: entry j uses the not yet overwritten entries to its
+	// left, so the columns are visited from last to first.
+	for i := 0; i < m; i++ {
+		btmp := b[i*ldb : i*ldb+n]
+		for j := n - 1; j >= 0; j-- {
+			tmp := btmp[j]
+			if nonUnit {
+				tmp *= a[j*lda+j]
+			}
+			tmp += f64.DotUnitary(a[j*lda:j*lda+j], btmp[:j])
+			btmp[j] = alpha * tmp
+		}
+	}
+}
--- a/vendor/gonum.org/v1/gonum/blas/gonum/sgemm.go
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/sgemm.go
@@ -0,0 +1,301 @@
+// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.
+
+// Copyright ©2014 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package gonum
+
+import (
+	"runtime"
+	"sync"
+
+	"gonum.org/v1/gonum/blas"
+	"gonum.org/v1/gonum/internal/asm/f32"
+)
+
+// Sgemm performs one of the matrix-matrix operations
+//
+//	C = alpha * A * B + beta * C
+//	C = alpha * Aᵀ * B + beta * C
+//	C = alpha * A * Bᵀ + beta * C
+//	C = alpha * Aᵀ * Bᵀ + beta * C
+//
+// where A is an m×k or k×m dense matrix, B is an n×k or k×n dense matrix, C is
+// an m×n matrix, and alpha and beta are scalars. tA and tB specify whether A or
+// B are transposed.
+//
+// Float32 implementations are autogenerated and not directly tested.
+func (Implementation) Sgemm(tA, tB blas.Transpose, m, n, k int, alpha float32, a []float32, lda int, b []float32, ldb int, beta float32, c []float32, ldc int) {
+	// Argument validation. The checks run in a fixed order, which determines
+	// the panic raised when several arguments are invalid.
+	if tA != blas.NoTrans && tA != blas.Trans && tA != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	if tB != blas.NoTrans && tB != blas.Trans && tB != blas.ConjTrans {
+		panic(badTranspose)
+	}
+	switch {
+	case m < 0:
+		panic(mLT0)
+	case n < 0:
+		panic(nLT0)
+	case k < 0:
+		panic(kLT0)
+	}
+
+	// Stored shapes (rows × columns) of A and B once transposition is
+	// taken into account.
+	aTrans := tA == blas.Trans || tA == blas.ConjTrans
+	rowsA, colsA := m, k
+	if aTrans {
+		rowsA, colsA = k, m
+	}
+	bTrans := tB == blas.Trans || tB == blas.ConjTrans
+	rowsB, colsB := k, n
+	if bTrans {
+		rowsB, colsB = n, k
+	}
+	switch {
+	case lda < max(1, colsA):
+		panic(badLdA)
+	case ldb < max(1, colsB):
+		panic(badLdB)
+	case ldc < max(1, n):
+		panic(badLdC)
+	}
+
+	// An empty C leaves nothing to compute.
+	if m == 0 || n == 0 {
+		return
+	}
+
+	// Slice length checks; for k == 0 the bounds for a and b are not positive
+	// and therefore always satisfied.
+	switch {
+	case len(a) < (rowsA-1)*lda+colsA:
+		panic(shortA)
+	case len(b) < (rowsB-1)*ldb+colsB:
+		panic(shortB)
+	case len(c) < (m-1)*ldc+n:
+		panic(shortC)
+	}
+
+	// No product term and beta == 1: C is left untouched.
+	if (alpha == 0 || k == 0) && beta == 1 {
+		return
+	}
+
+	// Apply beta to C before the product is accumulated into it.
+	switch beta {
+	case 1:
+		// C is used as is.
+	case 0:
+		for row := 0; row < m; row++ {
+			crow := c[row*ldc : row*ldc+n]
+			for col := range crow {
+				crow[col] = 0
+			}
+		}
+	default:
+		for row := 0; row < m; row++ {
+			crow := c[row*ldc : row*ldc+n]
+			for col := range crow {
+				crow[col] *= beta
+			}
+		}
+	}
+
+	sgemmParallel(aTrans, bTrans, m, n, k, a, lda, b, ldb, c, ldc, alpha)
+}
+
+func sgemmParallel(aTrans, bTrans bool, m, n, k int, a []float32, lda int, b []float32, ldb int, c []float32, ldc int, alpha float32) {
+	// sgemmParallel computes a parallel matrix multiplication by partitioning
+	// a and b into sub-blocks, and updating c with the multiplication of the sub-block
+	// In all cases,
+	// A = [ 	A_11	A_12 ... 	A_1j
+	//			A_21	A_22 ...	A_2j
+	//				...
+	//			A_i1	A_i2 ...	A_ij]
+	//
+	// and same for B. All of the submatrix sizes are blockSize×blockSize except
+	// at the edges.
+	//
+	// In all cases, there is one dimension for each matrix along which
+	// C must be updated sequentially.
+	// Cij = \sum_k Aik Bkj,	(A * B)
+	// Cij = \sum_k Aki Bkj,	(Aᵀ * B)
+	// Cij = \sum_k Aik Bjk,	(A * Bᵀ)
+	// Cij = \sum_k Aki Bjk,	(Aᵀ * Bᵀ)
+	//
+	// This code computes one {i, j} block sequentially along the k dimension,
+	// and computes all of the {i, j} blocks concurrently. This
+	// partitioning allows Cij to be updated in-place without race-conditions.
+	// One goroutine is launched per {i, j} block; the buffered channel
+	// workerLimit bounds how many of them run at the same time.
+	//
+	// http://alexkr.com/docs/matrixmult.pdf is a good reference on matrix-matrix
+	// multiplies, though this code does not copy matrices to attempt to eliminate
+	// cache misses.
+
+	// maxKLen keeps the full inner dimension, since k is shadowed by the
+	// block loop variable inside the goroutine below.
+	maxKLen := k
+	parBlocks := blocks(m, blockSize) * blocks(n, blockSize)
+	if parBlocks < minParBlock {
+		// The matrix multiplication is small in the dimensions where it can be
+		// computed concurrently. Just do it in serial.
+		sgemmSerial(aTrans, bTrans, m, n, k, a, lda, b, ldb, c, ldc, alpha)
+		return
+	}
+
+	// workerLimit caps the number of concurrently running workers,
+	// with the limit set to the number of procs available.
+	workerLimit := make(chan struct{}, runtime.GOMAXPROCS(0))
+
+	// wg is used to wait for all block computations to finish before returning.
+	var wg sync.WaitGroup
+	wg.Add(parBlocks)
+	defer wg.Wait()
+
+	for i := 0; i < m; i += blockSize {
+		for j := 0; j < n; j += blockSize {
+			// Blocks here until a worker slot is free.
+			workerLimit <- struct{}{}
+			go func(i, j int) {
+				defer func() {
+					wg.Done()
+					<-workerLimit
+				}()
+
+				// Clip the block to the matrix edge.
+				leni := blockSize
+				if i+leni > m {
+					leni = m - i
+				}
+				lenj := blockSize
+				if j+lenj > n {
+					lenj = n - j
+				}
+
+				cSub := sliceView32(c, ldc, i, j, leni, lenj)
+
+				// Compute A_ik B_kj for all k
+				for k := 0; k < maxKLen; k += blockSize {
+					lenk := blockSize
+					if k+lenk > maxKLen {
+						lenk = maxKLen - k
+					}
+					// The block coordinates are swapped for a transposed operand.
+					var aSub, bSub []float32
+					if aTrans {
+						aSub = sliceView32(a, lda, k, i, lenk, leni)
+					} else {
+						aSub = sliceView32(a, lda, i, k, leni, lenk)
+					}
+					if bTrans {
+						bSub = sliceView32(b, ldb, j, k, lenj, lenk)
+					} else {
+						bSub = sliceView32(b, ldb, k, j, lenk, lenj)
+					}
+					sgemmSerial(aTrans, bTrans, leni, lenj, lenk, aSub, lda, bSub, ldb, cSub, ldc, alpha)
+				}
+			}(i, j)
+		}
+	}
+}
+
+// sgemmSerial dispatches to the serial multiplication kernel matching the
+// transposition flags of a and b.
+func sgemmSerial(aTrans, bTrans bool, m, n, k int, a []float32, lda int, b []float32, ldb int, c []float32, ldc int, alpha float32) {
+	if !aTrans && !bTrans {
+		sgemmSerialNotNot(m, n, k, a, lda, b, ldb, c, ldc, alpha)
+		return
+	}
+	if aTrans && !bTrans {
+		sgemmSerialTransNot(m, n, k, a, lda, b, ldb, c, ldc, alpha)
+		return
+	}
+	if !aTrans && bTrans {
+		sgemmSerialNotTrans(m, n, k, a, lda, b, ldb, c, ldc, alpha)
+		return
+	}
+	if aTrans && bTrans {
+		sgemmSerialTransTrans(m, n, k, a, lda, b, ldb, c, ldc, alpha)
+		return
+	}
+	// The four guards above cover every combination of the two flags.
+	panic("unreachable")
+}
+
+// sgemmSerialNotNot accumulates alpha * a * b into c, where neither a nor b
+// is transposed.
+func sgemmSerialNotNot(m, n, k int, a []float32, lda int, b []float32, ldb int, c []float32, ldc int, alpha float32) {
+	// Row slices are taken up front rather than indexing with [i*stride+j];
+	// this was approximately 5 times faster as of go 1.3.
+	for row := 0; row < m; row++ {
+		cRow := c[row*ldc : row*ldc+n]
+		for l, av := range a[row*lda : row*lda+k] {
+			scaled := alpha * av
+			if scaled == 0 {
+				continue
+			}
+			f32.AxpyUnitary(scaled, b[l*ldb:l*ldb+n], cRow)
+		}
+	}
+}
+
+// sgemmSerialTransNot accumulates alpha * aᵀ * b into c, where a is
+// transposed and b is not.
+func sgemmSerialTransNot(m, n, k int, a []float32, lda int, b []float32, ldb int, c []float32, ldc int, alpha float32) {
+	// Row slices are taken up front rather than indexing with [i*stride+j];
+	// this was approximately 5 times faster as of go 1.3.
+	for l := 0; l < k; l++ {
+		bRow := b[l*ldb : l*ldb+n]
+		for i, av := range a[l*lda : l*lda+m] {
+			scaled := alpha * av
+			if scaled == 0 {
+				continue
+			}
+			cRow := c[i*ldc : i*ldc+n]
+			f32.AxpyUnitary(scaled, bRow, cRow)
+		}
+	}
+}
+
+// sgemmSerialNotTrans accumulates alpha * a * bᵀ into c, where a is not
+// transposed and b is.
+func sgemmSerialNotTrans(m, n, k int, a []float32, lda int, b []float32, ldb int, c []float32, ldc int, alpha float32) {
+	// Row slices are taken up front rather than indexing with [i*stride+j];
+	// this was approximately 5 times faster as of go 1.3.
+	for row := 0; row < m; row++ {
+		aRow := a[row*lda : row*lda+k]
+		cRow := c[row*ldc : row*ldc+n]
+		for col := range cRow {
+			cRow[col] += alpha * f32.DotUnitary(aRow, b[col*ldb:col*ldb+k])
+		}
+	}
+}
+
+// sgemmSerialTransTrans accumulates alpha * aᵀ * bᵀ into c, where both a and
+// b are transposed.
+func sgemmSerialTransTrans(m, n, k int, a []float32, lda int, b []float32, ldb int, c []float32, ldc int, alpha float32) {
+	// Row slices are taken up front rather than indexing with [i*stride+j];
+	// this was approximately 5 times faster as of go 1.3.
+	for l := 0; l < k; l++ {
+		for i, av := range a[l*lda : l*lda+m] {
+			scaled := alpha * av
+			if scaled == 0 {
+				continue
+			}
+			cRow := c[i*ldc : i*ldc+n]
+			// Column l of b is read with stride ldb, starting at b[l].
+			f32.AxpyInc(scaled, b[l:], cRow, uintptr(n), uintptr(ldb), 1, 0, 0)
+		}
+	}
+}
+
+// sliceView32 returns the sub-slice of a that spans the r×c block whose
+// top-left element is at row i, column j, for a matrix stored with row
+// stride lda. The returned slice keeps the stride lda.
+func sliceView32(a []float32, lda, i, j, r, c int) []float32 {
+	first := i*lda + j
+	end := (i+r-1)*lda + j + c
+	return a[first:end]
+}
--- a/vendor/gonum.org/v1/gonum/blas/gonum/single_precision.bash
+++ b/vendor/gonum.org/v1/gonum/blas/gonum/single_precision.bash
@@ -0,0 +1,224 @@
+#!/usr/bin/env bash
+
+# Copyright ©2015 The Gonum Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+WARNINGF32='//\
+// Float32 implementations are autogenerated and not directly tested.\
+'
+WARNINGC64='//\
+// Complex64 implementations are autogenerated and not directly tested.\
+'
+
+# Level1 routines.
+
+echo Generating level1float32.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level1float32.go
+cat level1float64.go \
+| gofmt -r 'blas.Float64Level1 -> blas.Float32Level1' \
+\
+| gofmt -r 'float64 -> float32' \
+| gofmt -r 'blas.DrotmParams -> blas.SrotmParams' \
+\
+| gofmt -r 'f64.AxpyInc -> f32.AxpyInc' \
+| gofmt -r 'f64.AxpyUnitary -> f32.AxpyUnitary' \
+| gofmt -r 'f64.DotUnitary -> f32.DotUnitary' \
+| gofmt -r 'f64.L2NormInc -> f32.L2NormInc' \
+| gofmt -r 'f64.L2NormUnitary -> f32.L2NormUnitary' \
+| gofmt -r 'f64.ScalInc -> f32.ScalInc' \
+| gofmt -r 'f64.ScalUnitary -> f32.ScalUnitary' \
+\
+| sed -e "s_^\(func (Implementation) \)D\(.*\)\$_$WARNINGF32\1S\2_" \
+      -e 's_^// D_// S_' \
+      -e "s_^\(func (Implementation) \)Id\(.*\)\$_$WARNINGF32\1Is\2_" \
+      -e 's_^// Id_// Is_' \
+      -e 's_"gonum.org/v1/gonum/internal/asm/f64"_"gonum.org/v1/gonum/internal/asm/f32"_' \
+      -e 's_"math"_math "gonum.org/v1/gonum/internal/math32"_' \
+      -e 's_safmin = 0x1p-1022_safmin = 0x1p-126_' \
+>> level1float32.go
+
+echo Generating level1cmplx64.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level1cmplx64.go
+cat level1cmplx128.go \
+| gofmt -r 'blas.Complex128Level1 -> blas.Complex64Level1' \
+\
+| gofmt -r 'float64 -> float32' \
+| gofmt -r 'complex128 -> complex64' \
+\
+| gofmt -r 'c128.AxpyInc -> c64.AxpyInc' \
+| gofmt -r 'c128.AxpyUnitary -> c64.AxpyUnitary' \
+| gofmt -r 'c128.DotcInc -> c64.DotcInc' \
+| gofmt -r 'c128.DotcUnitary -> c64.DotcUnitary' \
+| gofmt -r 'c128.DotuInc -> c64.DotuInc' \
+| gofmt -r 'c128.DotuUnitary -> c64.DotuUnitary' \
+| gofmt -r 'c128.ScalInc -> c64.ScalInc' \
+| gofmt -r 'c128.ScalUnitary -> c64.ScalUnitary' \
+| gofmt -r 'dcabs1 -> scabs1' \
+\
+| sed -e "s_^\(func (Implementation) \)Zdot\(.*\)\$_$WARNINGC64\1Cdot\2_" \
+      -e 's_^// Zdot_// Cdot_' \
+      -e "s_^\(func (Implementation) \)Zdscal\(.*\)\$_$WARNINGC64\1Csscal\2_" \
+      -e 's_^// Zdscal_// Csscal_' \
+      -e "s_^\(func (Implementation) \)Z\(.*\)\$_$WARNINGC64\1C\2_" \
+      -e 's_^// Z_// C_' \
+      -e "s_^\(func (Implementation) \)Iz\(.*\)\$_$WARNINGC64\1Ic\2_" \
+      -e 's_^// Iz_// Ic_' \
+      -e "s_^\(func (Implementation) \)Dz\(.*\)\$_$WARNINGC64\1Sc\2_" \
+      -e 's_^// Dz_// Sc_' \
+      -e 's_"gonum.org/v1/gonum/internal/asm/c128"_"gonum.org/v1/gonum/internal/asm/c64"_' \
+      -e 's_"math"_math "gonum.org/v1/gonum/internal/math32"_' \
+>> level1cmplx64.go
+
+echo Generating level1float32_sdot.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level1float32_sdot.go
+cat level1float64_ddot.go \
+| gofmt -r 'float64 -> float32' \
+\
+| gofmt -r 'f64.DotInc -> f32.DotInc' \
+| gofmt -r 'f64.DotUnitary -> f32.DotUnitary' \
+\
+| sed -e "s_^\(func (Implementation) \)D\(.*\)\$_$WARNINGF32\1S\2_" \
+      -e 's_^// D_// S_' \
+      -e 's_"gonum.org/v1/gonum/internal/asm/f64"_"gonum.org/v1/gonum/internal/asm/f32"_' \
+>> level1float32_sdot.go
+
+echo Generating level1float32_dsdot.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level1float32_dsdot.go
+cat level1float64_ddot.go \
+| gofmt -r '[]float64 -> []float32' \
+\
+| gofmt -r 'f64.DotInc -> f32.DdotInc' \
+| gofmt -r 'f64.DotUnitary -> f32.DdotUnitary' \
+\
+| sed -e "s_^\(func (Implementation) \)D\(.*\)\$_$WARNINGF32\1Ds\2_" \
+      -e 's_^// D_// Ds_' \
+      -e 's_"gonum.org/v1/gonum/internal/asm/f64"_"gonum.org/v1/gonum/internal/asm/f32"_' \
+>> level1float32_dsdot.go
+
+echo Generating level1float32_sdsdot.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level1float32_sdsdot.go
+cat level1float64_ddot.go \
+| gofmt -r 'float64 -> float32' \
+\
+| gofmt -r 'f64.DotInc(x, y, f(n), f(incX), f(incY), f(ix), f(iy)) -> alpha + float32(f32.DdotInc(x, y, f(n), f(incX), f(incY), f(ix), f(iy)))' \
+| gofmt -r 'f64.DotUnitary(a, b) -> alpha + float32(f32.DdotUnitary(a, b))' \
+\
+| sed -e "s_^\(func (Implementation) \)D\(.*\)\$_$WARNINGF32\1Sds\2_" \
+      -e 's_^// D\(.*\)$_// Sds\1 plus a constant_' \
+      -e 's_\\sum_alpha + \\sum_' \
+      -e 's/n int/n int, alpha float32/' \
+      -e 's_"gonum.org/v1/gonum/internal/asm/f64"_"gonum.org/v1/gonum/internal/asm/f32"_' \
+>> level1float32_sdsdot.go
+
+
+# Level2 routines.
+
+echo Generating level2float32.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level2float32.go
+cat level2float64.go \
+| gofmt -r 'blas.Float64Level2 -> blas.Float32Level2' \
+\
+| gofmt -r 'float64 -> float32' \
+\
+| gofmt -r 'f64.AxpyInc -> f32.AxpyInc' \
+| gofmt -r 'f64.AxpyIncTo -> f32.AxpyIncTo' \
+| gofmt -r 'f64.AxpyUnitary -> f32.AxpyUnitary' \
+| gofmt -r 'f64.AxpyUnitaryTo -> f32.AxpyUnitaryTo' \
+| gofmt -r 'f64.DotInc -> f32.DotInc' \
+| gofmt -r 'f64.DotUnitary -> f32.DotUnitary' \
+| gofmt -r 'f64.ScalInc -> f32.ScalInc' \
+| gofmt -r 'f64.ScalUnitary -> f32.ScalUnitary' \
+| gofmt -r 'f64.Ger -> f32.Ger' \
+| gofmt -r 'f64.GemvN -> f32.GemvN' \
+| gofmt -r 'f64.GemvT -> f32.GemvT' \
+| gofmt -r 'Implementation{}.Dscal -> Implementation{}.Sscal' \
+\
+| sed -e "s_^\(func (Implementation) \)D\(.*\)\$_$WARNINGF32\1S\2_" \
+      -e 's_^// D_// S_' \
+      -e 's_"gonum.org/v1/gonum/internal/asm/f64"_"gonum.org/v1/gonum/internal/asm/f32"_' \
+>> level2float32.go
+
+echo Generating level2cmplx64.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level2cmplx64.go
+cat level2cmplx128.go \
+| gofmt -r 'blas.Complex128Level2 -> blas.Complex64Level2' \
+\
+| gofmt -r 'complex128 -> complex64' \
+| gofmt -r 'float64 -> float32' \
+\
+| gofmt -r 'c128.AxpyInc -> c64.AxpyInc' \
+| gofmt -r 'c128.AxpyUnitary -> c64.AxpyUnitary' \
+| gofmt -r 'c128.DotuInc -> c64.DotuInc' \
+| gofmt -r 'c128.DotuUnitary -> c64.DotuUnitary' \
+| gofmt -r 'c128.ScalInc -> c64.ScalInc' \
+| gofmt -r 'c128.ScalUnitary -> c64.ScalUnitary' \
+\
+| sed -e "s_^\(func (Implementation) \)Z\(.*\)\$_$WARNINGC64\1C\2_" \
+      -e 's_^// Z_// C_' \
+      -e 's_"gonum.org/v1/gonum/internal/asm/c128"_"gonum.org/v1/gonum/internal/asm/c64"_' \
+      -e 's_"math/cmplx"_cmplx "gonum.org/v1/gonum/internal/cmplx64"_' \
+>> level2cmplx64.go
+
+# Level3 routines.
+
+echo Generating level3float32.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level3float32.go
+cat level3float64.go \
+| gofmt -r 'blas.Float64Level3 -> blas.Float32Level3' \
+\
+| gofmt -r 'float64 -> float32' \
+\
+| gofmt -r 'f64.AxpyUnitaryTo -> f32.AxpyUnitaryTo' \
+| gofmt -r 'f64.AxpyUnitary -> f32.AxpyUnitary' \
+| gofmt -r 'f64.DotUnitary -> f32.DotUnitary' \
+| gofmt -r 'f64.ScalUnitary -> f32.ScalUnitary' \
+\
+| sed -e "s_^\(func (Implementation) \)D\(.*\)\$_$WARNINGF32\1S\2_" \
+      -e 's_^// D_// S_' \
+      -e 's_"gonum.org/v1/gonum/internal/asm/f64"_"gonum.org/v1/gonum/internal/asm/f32"_' \
+>> level3float32.go
+
+echo Generating sgemm.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > sgemm.go
+cat dgemm.go \
+| gofmt -r 'float64 -> float32' \
+| gofmt -r 'sliceView64 -> sliceView32' \
+\
+| gofmt -r 'dgemmParallel -> sgemmParallel' \
+| gofmt -r 'computeNumBlocks64 -> computeNumBlocks32' \
+| gofmt -r 'dgemmSerial -> sgemmSerial' \
+| gofmt -r 'dgemmSerialNotNot -> sgemmSerialNotNot' \
+| gofmt -r 'dgemmSerialTransNot -> sgemmSerialTransNot' \
+| gofmt -r 'dgemmSerialNotTrans -> sgemmSerialNotTrans' \
+| gofmt -r 'dgemmSerialTransTrans -> sgemmSerialTransTrans' \
+\
+| gofmt -r 'f64.AxpyInc -> f32.AxpyInc' \
+| gofmt -r 'f64.AxpyUnitary -> f32.AxpyUnitary' \
+| gofmt -r 'f64.DotUnitary -> f32.DotUnitary' \
+\
+| sed -e "s_^\(func (Implementation) \)D\(.*\)\$_$WARNINGF32\1S\2_" \
+      -e 's_^// D_// S_' \
+      -e 's_^// d_// s_' \
+      -e 's_"gonum.org/v1/gonum/internal/asm/f64"_"gonum.org/v1/gonum/internal/asm/f32"_' \
+>> sgemm.go
+
+echo Generating level3cmplx64.go
+echo -e '// Code generated by "go generate gonum.org/v1/gonum/blas/gonum”; DO NOT EDIT.\n' > level3cmplx64.go
+cat level3cmplx128.go \
+| gofmt -r 'blas.Complex128Level3 -> blas.Complex64Level3' \
+\
+| gofmt -r 'float64 -> float32' \
+| gofmt -r 'complex128 -> complex64' \
+\
+| gofmt -r 'c128.ScalUnitary -> c64.ScalUnitary' \
+| gofmt -r 'c128.DscalUnitary -> c64.SscalUnitary' \
+| gofmt -r 'c128.DotcUnitary -> c64.DotcUnitary' \
+| gofmt -r 'c128.AxpyUnitary -> c64.AxpyUnitary' \
+| gofmt -r 'c128.DotuUnitary -> c64.DotuUnitary' \
+\
+| sed -e "s_^\(func (Implementation) \)Z\(.*\)\$_$WARNINGC64\1C\2_" \
+      -e 's_^// Z_// C_' \
+      -e 's_"gonum.org/v1/gonum/internal/asm/c128"_"gonum.org/v1/gonum/internal/asm/c64"_' \
+      -e 's_"math/cmplx"_cmplx "gonum.org/v1/gonum/internal/cmplx64"_' \
+>> level3cmplx64.go
--- a/vendor/gonum.org/v1/gonum/floats/README.md
+++ b/vendor/gonum.org/v1/gonum/floats/README.md
@@ -0,0 +1,7 @@
+# Gonum floats
+
+[![go.dev reference](https://pkg.go.dev/badge/gonum.org/v1/gonum/floats)](https://pkg.go.dev/gonum.org/v1/gonum/floats)
+[![GoDoc](https://godocs.io/gonum.org/v1/gonum/floats?status.svg)](https://godocs.io/gonum.org/v1/gonum/floats)
+
+Package floats provides a set of helper routines for dealing with slices of float64.
+The functions avoid allocations to allow for use within tight loops without garbage collection overhead.
--- a/vendor/gonum.org/v1/gonum/floats/doc.go
+++ b/vendor/gonum.org/v1/gonum/floats/doc.go
@@ -0,0 +1,11 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package floats provides a set of helper routines for dealing with slices
+// of float64. The functions avoid allocations to allow for use within tight
+// loops without garbage collection overhead.
+//
+// The convention used is that when a slice is being modified in place, it has
+// the name dst.
+package floats // import "gonum.org/v1/gonum/floats"
--- a/vendor/gonum.org/v1/gonum/floats/floats.go
+++ b/vendor/gonum.org/v1/gonum/floats/floats.go
@@ -0,0 +1,807 @@
+// Copyright ©2013 The Gonum Authors. All rights reserved.
+// Use of this code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package floats
+
+import (
+	"errors"
+	"math"
+	"sort"
+
+	"gonum.org/v1/gonum/floats/scalar"
+	"gonum.org/v1/gonum/internal/asm/f64"
+)
+
+// Panic messages used by the argument checks in this package.
+const (
+	zeroLength   = "floats: zero length slice"
+	shortSpan    = "floats: slice length less than 2"
+	badLength    = "floats: slice lengths do not match"
+	badDstLength = "floats: destination slice length does not match input"
+)
+
+// Add adds, element-wise, the elements of s and dst, and stores the result in dst.
+// It panics if the argument lengths do not match.
+func Add(dst, s []float64) {
+	if len(dst) != len(s) {
+		panic(badDstLength)
+	}
+	// dst is passed both as the destination and as the last operand, with a
+	// scale factor of 1 applied to s.
+	f64.AxpyUnitaryTo(dst, 1, s, dst)
+}
+
+// AddTo adds, element-wise, the elements of s and t and
+// stores the result in dst. It returns dst.
+// It panics if the argument lengths do not match.
+func AddTo(dst, s, t []float64) []float64 {
+	// The two inputs are checked against each other first, then dst is
+	// checked against them, so each mismatch gets its own panic message.
+	if len(s) != len(t) {
+		panic(badLength)
+	}
+	if len(dst) != len(s) {
+		panic(badDstLength)
+	}
+	f64.AxpyUnitaryTo(dst, 1, s, t)
+	return dst
+}
+
+// AddConst adds the scalar c to all of the values in dst.
+// The work is delegated to f64.AddConst.
+func AddConst(c float64, dst []float64) {
+	f64.AddConst(c, dst)
+}
+
+// AddScaled performs dst = dst + alpha * s.
+// It panics if the slice argument lengths do not match.
+func AddScaled(dst []float64, alpha float64, s []float64) {
+	if len(dst) != len(s) {
+		panic(badLength)
+	}
+	// dst is passed both as the destination and as the last operand.
+	f64.AxpyUnitaryTo(dst, alpha, s, dst)
+}
+
+// AddScaledTo performs dst = y + alpha * s, where alpha is a scalar,
+// and dst, y and s are all slices. It returns dst.
+// It panics if the slice argument lengths do not match.
+//
+// At the return of the function, dst[i] = y[i] + alpha * s[i]
+func AddScaledTo(dst, y []float64, alpha float64, s []float64) []float64 {
+	// The two inputs are checked against each other first, then dst is
+	// checked against them, so each mismatch gets its own panic message.
+	if len(s) != len(y) {
+		panic(badLength)
+	}
+	if len(dst) != len(y) {
+		panic(badDstLength)
+	}
+	f64.AxpyUnitaryTo(dst, alpha, s, y)
+	return dst
+}
+
+// argsort is a helper that implements sort.Interface, as used by
+// Argsort and ArgsortStable.
+type argsort struct {
+	s    []float64 // values being sorted
+	inds []int     // permuted in lock-step with s
+}
+
+// Len returns the number of values being sorted.
+func (a argsort) Len() int {
+	return len(a.s)
+}
+
+// Less orders elements by their value in s.
+func (a argsort) Less(i, j int) bool {
+	return a.s[i] < a.s[j]
+}
+
+// Swap exchanges elements i and j in both s and inds so that the two
+// slices stay aligned.
+func (a argsort) Swap(i, j int) {
+	a.s[i], a.s[j] = a.s[j], a.s[i]
+	a.inds[i], a.inds[j] = a.inds[j], a.inds[i]
+}
+
+// Argsort sorts dst in increasing order in place and records in inds where
+// each element came from, so that on return dst[i] = origDst[inds[i]].
+// Any previous contents of inds are overwritten.
+// It panics if the argument lengths do not match.
+func Argsort(dst []float64, inds []int) {
+	if len(dst) != len(inds) {
+		panic(badDstLength)
+	}
+	// Start from the identity permutation; the sort permutes it in step
+	// with dst.
+	for i := range inds {
+		inds[i] = i
+	}
+	sort.Sort(argsort{s: dst, inds: inds})
+}
+
+// ArgsortStable sorts dst in increasing order in place, keeping the original
+// relative order of equal elements, and records in inds where each element
+// came from, so that on return dst[i] = origDst[inds[i]].
+// Any previous contents of inds are overwritten.
+// It panics if the argument lengths do not match.
+func ArgsortStable(dst []float64, inds []int) {
+	if len(dst) != len(inds) {
+		panic(badDstLength)
+	}
+	// Start from the identity permutation; the sort permutes it in step
+	// with dst.
+	for i := range inds {
+		inds[i] = i
+	}
+	sort.Stable(argsort{s: dst, inds: inds})
+}
+
+// Count applies the function f to every element of s and returns the number
+// of times the function returned true.
+func Count(f func(float64) bool, s []float64) int {
+	var n int
+	for _, val := range s {
+		if f(val) {
+			n++
+		}
+	}
+	return n
+}
+
+// CumProd finds the cumulative product of the first i elements in
+// s and puts them in place into the ith element of the
+// destination dst. It returns dst.
+// It panics if the argument lengths do not match.
+//
+// At the return of the function, dst[i] = s[i] * s[i-1] * s[i-2] * ...
+func CumProd(dst, s []float64) []float64 {
+	if len(dst) != len(s) {
+		panic(badDstLength)
+	}
+	// Empty input is returned as is without calling the f64 kernel.
+	if len(dst) == 0 {
+		return dst
+	}
+	return f64.CumProd(dst, s)
+}
+
+// CumSum finds the cumulative sum of the first i elements in
+// s and puts them in place into the ith element of the
+// destination dst. It returns dst.
+// It panics if the argument lengths do not match.
+//
+// At the return of the function, dst[i] = s[i] + s[i-1] + s[i-2] + ...
+func CumSum(dst, s []float64) []float64 {
+	if len(dst) != len(s) {
+		panic(badDstLength)
+	}
+	// Empty input is returned as is without calling the f64 kernel.
+	if len(dst) == 0 {
+		return dst
+	}
+	return f64.CumSum(dst, s)
+}
+
+// Distance computes the L-norm of s - t. See Norm for special cases.
+// It returns 0 for empty input.
+// It panics if the slice argument lengths do not match.
+func Distance(s, t []float64, L float64) float64 {
+	if len(s) != len(t) {
+		panic(badLength)
+	}
+	if len(s) == 0 {
+		return 0
+	}
+	switch {
+	case L == 2:
+		return f64.L2DistanceUnitary(s, t)
+	case L == 1:
+		// Sum of absolute differences.
+		var total float64
+		for i := range s {
+			total += math.Abs(t[i] - s[i])
+		}
+		return total
+	case math.IsInf(L, 1):
+		// Largest absolute difference.
+		var largest float64
+		for i := range s {
+			d := math.Abs(t[i] - s[i])
+			if d > largest {
+				largest = d
+			}
+		}
+		return largest
+	}
+	// General case: (sum |t[i]-s[i]|^L)^(1/L).
+	var acc float64
+	for i := range s {
+		acc += math.Pow(math.Abs(t[i]-s[i]), L)
+	}
+	return math.Pow(acc, 1/L)
+}
+
+// Div performs element-wise division dst / s
+// and stores the value in dst.
+// It panics if the argument lengths do not match.
+func Div(dst, s []float64) {
+	if len(dst) != len(s) {
+		panic(badLength)
+	}
+	// The division itself is delegated to the f64 kernel.
+	f64.Div(dst, s)
+}
+
+// DivTo performs element-wise division s / t
+// and stores the value in dst. It returns the result of f64.DivTo.
+// It panics if the argument lengths do not match.
+func DivTo(dst, s, t []float64) []float64 {
+	// The two inputs are checked against each other first, then dst is
+	// checked against them, so each mismatch gets its own panic message.
+	if len(s) != len(t) {
+		panic(badLength)
+	}
+	if len(dst) != len(s) {
+		panic(badDstLength)
+	}
+	return f64.DivTo(dst, s, t)
+}
+
+// Dot computes the dot product of s1 and s2, i.e.
+// sum_{i = 1}^N s1[i]*s2[i].
+// It panics if the argument lengths do not match.
+func Dot(s1, s2 []float64) float64 {
+	if len(s1) != len(s2) {
+		panic(badLength)
+	}
+	// The summation itself is delegated to the f64 kernel.
+	return f64.DotUnitary(s1, s2)
+}
+
+// Equal reports whether s1 and s2 have the same length and every pair of
+// corresponding elements compares equal with ==. A NaN element therefore
+// makes the slices unequal.
+func Equal(s1, s2 []float64) bool {
+	if len(s1) != len(s2) {
+		return false
+	}
+	for i := range s1 {
+		if s1[i] != s2[i] {
+			return false
+		}
+	}
+	return true
+}
+
+// EqualApprox reports whether s1 and s2 have the same length and every pair
+// of corresponding elements satisfies scalar.EqualWithinAbsOrRel with tol
+// used as both the absolute and the relative tolerance.
+func EqualApprox(s1, s2 []float64, tol float64) bool {
+	if len(s1) != len(s2) {
+		return false
+	}
+	for i := range s1 {
+		if scalar.EqualWithinAbsOrRel(s1[i], s2[i], tol, tol) {
+			continue
+		}
+		return false
+	}
+	return true
+}
+
+// EqualFunc reports whether s1 and s2 have the same length and f returns
+// true for every pair of corresponding elements. f is called as
+// f(s1[i], s2[i]) and evaluation stops at the first pair it rejects.
+func EqualFunc(s1, s2 []float64, f func(float64, float64) bool) bool {
+	if len(s1) != len(s2) {
+		return false
+	}
+	for i := range s1 {
+		if f(s1[i], s2[i]) {
+			continue
+		}
+		return false
+	}
+	return true
+}
+
+// EqualLengths reports whether all of the slices have the same length.
+// It returns true when called with no slices at all.
+func EqualLengths(slices ...[]float64) bool {
+	// Without this guard the slices[0] access below would be out of
+	// range for an empty argument list.
+	if len(slices) == 0 {
+		return true
+	}
+	want := len(slices[0])
+	for _, s := range slices[1:] {
+		if len(s) != want {
+			return false
+		}
+	}
+	return true
+}
+
+// Find applies f to the elements of s in order and returns the indices of
+// the first k elements for which f returns true, or of all such elements
+// if k < 0.
+// Find reslices inds to zero length and appends the found indices to it;
+// the result is returned so that Find may be called with a nil inds.
+// If k > 0 and fewer than k elements of s satisfy f, all of the found
+// indices are returned along with an error.
+// At the return of the function, the input inds will be in an undetermined state.
+func Find(inds []int, f func(float64) bool, s []float64, k int) ([]int, error) {
+	inds = inds[:0]
+
+	// Nothing was requested, so s is not inspected at all.
+	if k == 0 {
+		return inds, nil
+	}
+
+	for i, val := range s {
+		if !f(val) {
+			continue
+		}
+		inds = append(inds, i)
+		// With a positive k, stop as soon as the quota is filled.
+		if k > 0 && len(inds) == k {
+			return inds, nil
+		}
+	}
+
+	// A negative k asks for every match, so finishing the scan is success.
+	if k < 0 {
+		return inds, nil
+	}
+	// The scan finished without collecting k matches.
+	return inds, errors.New("floats: insufficient elements found")
+}
+
+// HasNaN reports whether any element of s is NaN.
+func HasNaN(s []float64) bool {
+	for i := range s {
+		if math.IsNaN(s[i]) {
+			return true
+		}
+	}
+	return false
+}
+
+// LogSpan fills dst with N points equally spaced in log space between l and
+// u, where N is equal to len(dst). The first element of the resulting dst
+// will be l and the final element of dst will be u.
+// It panics if the length of dst is less than 2.
+// Note that this call will return NaNs if either l or u are negative, and
+// will return all zeros if l or u is zero.
+// Also returns the mutated slice dst, so that it can be used in range, like:
+//
+//	for i, x := range LogSpan(dst, l, u) { ... }
+func LogSpan(dst []float64, l, u float64) []float64 {
+	// Build a linear span between the logarithms of the bounds, then map
+	// every point back through the exponential.
+	for i, v := range Span(dst, math.Log(l), math.Log(u)) {
+		dst[i] = math.Exp(v)
+	}
+	return dst
+}
+
+// LogSumExp returns the log of the sum of the exponentials of the values in s.
+// Panics if s is an empty slice.
+func LogSumExp(s []float64) float64 {
+	// Want to do this in a numerically stable way which avoids
+	// overflow and underflow
+	// First, find the maximum value in the slice.
+	maxval := Max(s)
+	if math.IsInf(maxval, 0) {
+		// If it's infinity either way, the logsumexp will be infinity as well
+		// returning now avoids NaNs
+		return maxval
+	}
+	var lse float64
+	// Compute the sumexp part
+	for _, val := range s {
+		lse += math.Exp(val - maxval)
+	}
+	// Take the log and add back on the constant taken out
+	return math.Log(lse) + maxval
+}
+
+// Max returns the maximum value in the input slice, i.e. the element at the
+// index reported by MaxIdx. If the slice is empty, Max will panic.
+func Max(s []float64) float64 {
+	return s[MaxIdx(s)]
+}
+
+// MaxIdx returns the index of the maximum value in the input slice. If several
+// entries have the maximum value, the first such index is returned.
+// NaN entries are skipped; if every entry is NaN the result is 0.
+// It panics if s is zero length.
+func MaxIdx(s []float64) int {
+	if len(s) == 0 {
+		panic(zeroLength)
+	}
+	var (
+		best    float64
+		bestIdx int
+		seen    bool // whether a non-NaN entry has been recorded yet
+	)
+	for i, v := range s {
+		if math.IsNaN(v) {
+			continue
+		}
+		if !seen || v > best {
+			best, bestIdx, seen = v, i, true
+		}
+	}
+	return bestIdx
+}
+
+// Min returns the minimum value in the input slice, i.e. the element at the
+// index reported by MinIdx.
+// It panics if s is zero length.
+func Min(s []float64) float64 {
+	return s[MinIdx(s)]
+}
+
+// MinIdx returns the index of the minimum value in the input slice. If several
+// entries have the minimum value, the first such index is returned.
+// NaN entries are skipped; if every entry is NaN the result is 0.
+// It panics if s is zero length.
+func MinIdx(s []float64) int {
+	if len(s) == 0 {
+		panic(zeroLength)
+	}
+	var (
+		best    float64
+		bestIdx int
+		seen    bool // whether a non-NaN entry has been recorded yet
+	)
+	for i, v := range s {
+		if math.IsNaN(v) {
+			continue
+		}
+		if !seen || v < best {
+			best, bestIdx, seen = v, i, true
+		}
+	}
+	return bestIdx
+}
+
+// Mul multiplies each element of dst by the corresponding element of s,
+// storing the product back into dst.
+// It panics if the argument lengths do not match.
+func Mul(dst, s []float64) {
+	if len(dst) != len(s) {
+		panic(badLength)
+	}
+	for i := range dst {
+		dst[i] *= s[i]
+	}
+}
+
+// MulTo stores the element-wise product of s and t in dst and returns dst.
+// It panics if the argument lengths do not match.
+func MulTo(dst, s, t []float64) []float64 {
+	if len(s) != len(t) {
+		panic(badLength)
+	}
+	if len(dst) != len(s) {
+		panic(badDstLength)
+	}
+	for i := range dst {
+		dst[i] = t[i] * s[i]
+	}
+	return dst
+}
+
+// NearestIdx returns the index of the element in s
+// whose value is nearest to v. If several such
+// elements exist, the lowest index is returned.
+// A NaN v yields 0, and an infinite v yields MaxIdx(s) or MinIdx(s)
+// according to its sign.
+// It panics if s is zero length.
+func NearestIdx(s []float64, v float64) int {
+	if len(s) == 0 {
+		panic(zeroLength)
+	}
+	if math.IsNaN(v) {
+		return 0
+	}
+	if math.IsInf(v, 1) {
+		return MaxIdx(s)
+	}
+	if math.IsInf(v, -1) {
+		return MinIdx(s)
+	}
+	var (
+		closest    float64
+		closestIdx int
+		seen       bool // whether a non-NaN distance has been recorded yet
+	)
+	for i, val := range s {
+		d := math.Abs(v - val)
+		// A NaN distance will not be closer.
+		if math.IsNaN(d) {
+			continue
+		}
+		if !seen || d < closest {
+			closest, closestIdx, seen = d, i, true
+		}
+	}
+	return closestIdx
+}
+
+// NearestIdxForSpan returns the index of a hypothetical vector created
+// by Span with length n and bounds l and u whose value is closest
+// to v. That is, NearestIdxForSpan(n, l, u, v) is intended to be equivalent
+// to NearestIdx(Span(make([]float64, n),l,u),v) without an allocation.
+// It returns 0 when v is NaN.
+// It panics if n is less than two.
+func NearestIdxForSpan(n int, l, u float64, v float64) int {
+	if n < 2 {
+		panic(shortSpan)
+	}
+	if math.IsNaN(v) {
+		return 0
+	}
+
+	// Special cases for Inf and NaN.
+	// The cases are tested in order, so later cases can assume that the
+	// conditions of the earlier ones did not hold.
+	switch {
+	case math.IsNaN(l) && !math.IsNaN(u):
+		// Only the last point of such a span (u) is not NaN.
+		return n - 1
+	case math.IsNaN(u):
+		return 0
+	case math.IsInf(l, 0) && math.IsInf(u, 0):
+		if l == u {
+			return 0
+		}
+		// For odd n the span has a middle element at n/2 between the
+		// run of l values and the run of u values.
+		if n%2 == 1 {
+			if !math.IsInf(v, 0) {
+				return n / 2
+			}
+			if math.Copysign(1, v) == math.Copysign(1, l) {
+				return 0
+			}
+			return n/2 + 1
+		}
+		// For even n the choice is made on the sign of v alone.
+		if math.Copysign(1, v) == math.Copysign(1, l) {
+			return 0
+		}
+		return n / 2
+	case math.IsInf(l, 0):
+		// Only l is infinite here: index 0 when v equals l, otherwise
+		// the last index.
+		if v == l {
+			return 0
+		}
+		return n - 1
+	case math.IsInf(u, 0):
+		// Only u is infinite here: the last index when v equals u,
+		// otherwise index 0.
+		if v == u {
+			return n - 1
+		}
+		return 0
+	case math.IsInf(v, -1):
+		// Finite bounds with v = -Inf: pick the end holding the smaller bound.
+		if l <= u {
+			return 0
+		}
+		return n - 1
+	case math.IsInf(v, 1):
+		// Finite bounds with v = +Inf: pick the end holding the larger bound.
+		if u <= l {
+			return 0
+		}
+		return n - 1
+	}
+
+	// Special cases for v outside (l, u) and (u, l).
+	switch {
+	case l < u:
+		if v <= l {
+			return 0
+		}
+		if v >= u {
+			return n - 1
+		}
+	case l > u:
+		if v >= l {
+			return 0
+		}
+		if v <= u {
+			return n - 1
+		}
+	default:
+		// l == u, so index 0 is returned.
+		return 0
+	}
+
+	// Can't guarantee anything about exactly halfway between
+	// because of floating point weirdness.
+	// The offset of v from l is scaled into index units, and adding 0.5
+	// before the integer conversion rounds it to the nearest index.
+	return int((float64(n)-1)/(u-l)*(v-l) + 0.5)
+}
+
+// Norm returns the L norm of the slice s, defined as
+// (sum_{i=1}^N |s[i]|^L)^{1/L}
+// Special cases:
+// L = math.Inf(1) gives the maximum absolute value.
+// An empty slice gives 0.
+// Does not correctly compute the zero norm (use Count).
+func Norm(s []float64, L float64) float64 {
+	// Open questions carried over from the original implementation:
+	// should this complain if L is not positive, and should the general
+	// case be done in log space for better numerical stability (more
+	// costly, so perhaps only when L is high)?
+	if len(s) == 0 {
+		return 0
+	}
+	switch {
+	case L == 2:
+		return f64.L2NormUnitary(s)
+	case L == 1:
+		// Sum of absolute values.
+		var total float64
+		for i := range s {
+			total += math.Abs(s[i])
+		}
+		return total
+	case math.IsInf(L, 1):
+		// Largest absolute value.
+		var largest float64
+		for i := range s {
+			largest = math.Max(largest, math.Abs(s[i]))
+		}
+		return largest
+	}
+	// General case.
+	var acc float64
+	for i := range s {
+		acc += math.Pow(math.Abs(s[i]), L)
+	}
+	return math.Pow(acc, 1/L)
+}
+
+// Prod returns the product of the elements of the slice, multiplied in
+// index order.
+// Returns 1 if len(s) = 0.
+func Prod(s []float64) float64 {
+	result := 1.0
+	for i := range s {
+		result *= s[i]
+	}
+	return result
+}
+
+// Reverse reverses the order of elements in the slice, in place.
+func Reverse(s []float64) {
+	n := len(s)
+	// Swap each element of the first half with its mirror; the middle
+	// element of an odd-length slice stays where it is.
+	for i := 0; i < n/2; i++ {
+		j := n - 1 - i
+		s[i], s[j] = s[j], s[i]
+	}
+}
+
+// Same reports whether s and t have the same length and every pair of
+// corresponding elements either compares equal or consists of two NaNs.
+func Same(s, t []float64) bool {
+	if len(s) != len(t) {
+		return false
+	}
+	for i := range s {
+		v, w := s[i], t[i]
+		if v == w {
+			continue
+		}
+		if math.IsNaN(v) && math.IsNaN(w) {
+			continue
+		}
+		return false
+	}
+	return true
+}
+
+// Scale multiplies every element in dst by the scalar c.
+func Scale(c float64, dst []float64) {
+	// The f64 kernel is only invoked for non-empty input.
+	if len(dst) > 0 {
+		f64.ScalUnitary(c, dst)
+	}
+}
+
+// ScaleTo multiplies the elements in s by c and stores the result in dst.
+// It returns dst.
+// It panics if the slice argument lengths do not match.
+func ScaleTo(dst []float64, c float64, s []float64) []float64 {
+	if len(dst) != len(s) {
+		panic(badDstLength)
+	}
+	// The f64 kernel is only invoked for non-empty input.
+	if len(dst) > 0 {
+		f64.ScalUnitaryTo(dst, c, s)
+	}
+	return dst
+}
+
+// Span fills dst with N equally spaced points between l and u, where N
+// is equal to the length of the destination. The first element of the destination
+// is l, the final element of the destination is u.
+// It panics if the length of dst is less than 2.
+//
+// Span also returns the mutated slice dst, so that it can be used in range expressions,
+// like:
+//
+//	for i, x := range Span(dst, l, u) { ... }
+func Span(dst []float64, l, u float64) []float64 {
+	n := len(dst)
+	if n < 2 {
+		panic(shortSpan)
+	}
+	last := n - 1
+	lInf, uInf := math.IsInf(l, 0), math.IsInf(u, 0)
+
+	switch {
+	case math.IsNaN(l):
+		// Everything except the final point is NaN.
+		for i := 0; i < last; i++ {
+			dst[i] = math.NaN()
+		}
+		dst[last] = u
+	case math.IsNaN(u):
+		// Everything except the first point is NaN.
+		dst[0] = l
+		for i := 1; i < n; i++ {
+			dst[i] = math.NaN()
+		}
+	case lInf && uInf:
+		// The lower half takes l and the upper half takes u. For odd
+		// lengths the middle point is 0 when the bounds differ and l
+		// when they are equal.
+		half := n / 2
+		for i := 0; i < half; i++ {
+			dst[i] = l
+			dst[last-i] = u
+		}
+		if n%2 == 1 {
+			mid := l
+			if l != u {
+				mid = 0
+			}
+			dst[half] = mid
+		}
+	case lInf:
+		// Only l is infinite: every point but the last is l.
+		for i := 0; i < last; i++ {
+			dst[i] = l
+		}
+		dst[last] = u
+	case uInf:
+		// Only u is infinite: every point but the first is u.
+		dst[0] = l
+		for i := 1; i < n; i++ {
+			dst[i] = u
+		}
+	default:
+		// Finite bounds: step linearly from l.
+		step := (u - l) / float64(last)
+		for i := range dst {
+			dst[i] = l + step*float64(i)
+		}
+	}
+	return dst
+}
+
+// Sub subtracts, element-wise, the elements of s from dst.
+// It panics if the argument lengths do not match.
+func Sub(dst, s []float64) {
+	if len(dst) != len(s) {
+		panic(badLength)
+	}
+	// The subtraction is done by the f64 kernel with a scale factor of -1
+	// on s; dst is passed both as the destination and as the last operand.
+	f64.AxpyUnitaryTo(dst, -1, s, dst)
+}
+
+// SubTo subtracts, element-wise, the elements of t from s and
+// stores the result in dst. It returns dst.
+// It panics if the argument lengths do not match.
+func SubTo(dst, s, t []float64) []float64 {
+	// The two inputs are checked against each other first, then dst is
+	// checked against them, so each mismatch gets its own panic message.
+	if len(s) != len(t) {
+		panic(badLength)
+	}
+	if len(dst) != len(s) {
+		panic(badDstLength)
+	}
+	// The subtraction is done by the f64 kernel with a scale factor of -1 on t.
+	f64.AxpyUnitaryTo(dst, -1, t, s)
+	return dst
+}
+
+// Sum returns the sum of the elements of the slice.
+// The summation is delegated to f64.Sum.
+func Sum(s []float64) float64 {
+	return f64.Sum(s)
+}
+
+// Within returns the first index i where s[i] <= v < s[i+1]. It returns -1
+// when v is NaN, v < s[0] or v >= s[len(s)-1]. Within panics if:
+//   - len(s) < 2
+//   - s is not sorted
+func Within(s []float64, v float64) int {
+	if len(s) < 2 {
+		panic(shortSpan)
+	}
+	if !sort.Float64sAreSorted(s) {
+		panic("floats: input slice not sorted")
+	}
+	if v < s[0] || v >= s[len(s)-1] || math.IsNaN(v) {
+		return -1
+	}
+	// Find the first upper neighbour that exceeds v; the interval that
+	// contains v starts one position before it.
+	for hi := 1; hi < len(s); hi++ {
+		if v < s[hi] {
+			return hi - 1
+		}
+	}
+	return -1
+}
+
+// SumCompensated returns the sum of the elements of the slice calculated with greater
+// accuracy than Sum at the expense of additional computation.
+func SumCompensated(s []float64) float64 {
+	// This is the improved version of Kahan's compensated summation
+	// algorithm proposed by Neumaier.
+	// See https://en.wikipedia.org/wiki/Kahan_summation_algorithm for details.
+	var (
+		running    float64 // plain running sum
+		correction float64 // accumulated low-order parts lost from running
+	)
+	for i := range s {
+		x := s[i]
+		// This type conversion is here to prevent a sufficiently smart compiler
+		// from optimising away these operations.
+		next := float64(running + x)
+		if math.Abs(running) >= math.Abs(x) {
+			// The low-order digits of x were lost in the addition.
+			correction += (running - next) + x
+		} else {
+			// The low-order digits of the running sum were lost.
+			correction += (x - next) + running
+		}
+		running = next
+	}
+	return running + correction
+}
--- a/vendor/gonum.org/v1/gonum/floats/scalar/doc.go
+++ b/vendor/gonum.org/v1/gonum/floats/scalar/doc.go
@@ -0,0 +1,6 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package scalar provides a set of helper routines for dealing with float64 values.
+package scalar // import "gonum.org/v1/gonum/floats/scalar"
--- a/vendor/gonum.org/v1/gonum/floats/scalar/scalar.go
+++ b/vendor/gonum.org/v1/gonum/floats/scalar/scalar.go
@@ -0,0 +1,171 @@
+// Copyright ©2013 The Gonum Authors. All rights reserved.
+// Use of this code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package scalar
+
+import (
+	"math"
+	"strconv"
+)
+
+// EqualWithinAbs reports whether a == b or the absolute difference |a-b| is at most tol.
+func EqualWithinAbs(a, b, tol float64) bool {
+	gap := math.Abs(a - b)
+	return a == b || gap <= tol
+}
+
+// minNormalFloat64 is 2^{-1022}, the smallest positive normal float64;
+// differences at or below it are judged on an absolute scale instead.
+const minNormalFloat64 = 0x1p-1022
+
+// EqualWithinRel reports whether abs(a-b) <= tol * max(abs(a), abs(b)),
+// i.e. whether a and b agree to the relative tolerance tol. A difference
+// no larger than minNormalFloat64 is instead tested against
+// tol*minNormalFloat64.
+func EqualWithinRel(a, b, tol float64) bool {
+	diff := math.Abs(a - b)
+	switch {
+	case a == b:
+		return true
+	case diff <= minNormalFloat64:
+		return diff <= tol*minNormalFloat64
+	}
+	// A remaining infinity makes diff/scale Inf/Inf = NaN, which fails
+	// the comparison below; equal infinities were already accepted by
+	// the a == b case above.
+	scale := math.Max(math.Abs(a), math.Abs(b))
+	return diff/scale <= tol
+}
+
+// EqualWithinAbsOrRel returns true when a and b agree to within absTol
+// according to EqualWithinAbs or to within relTol according to
+// EqualWithinRel; passing either test is sufficient.
+func EqualWithinAbsOrRel(a, b, absTol, relTol float64) bool {
+	return EqualWithinAbs(a, b, absTol) || EqualWithinRel(a, b, relTol) // absolute test first; relative only if it fails
+}
+
+// EqualWithinULP reports whether a and b are at most ulp units in the last place
+// apart; operands of opposite sign are measured through zero and NaN never matches.
+func EqualWithinULP(a, b float64, ulp uint) bool {
+	switch {
+	case a == b:
+		return true
+	case math.IsNaN(a) || math.IsNaN(b):
+		return false
+	}
+	if math.Signbit(a) == math.Signbit(b) {
+		return ulpDiff(math.Float64bits(a), math.Float64bits(b)) <= uint64(ulp)
+	}
+	return math.Float64bits(math.Abs(a))+math.Float64bits(math.Abs(b)) <= uint64(ulp)
+}
+
+func ulpDiff(a, b uint64) uint64 { // |a-b| without unsigned wrap-around
+	if b >= a {
+		return b - a
+	}
+	return a - b
+}
+
+const (
+	nanBits = 0x7ff8000000000000 // exponent all ones plus the quiet bit
+	nanMask = 0xfff8000000000000 // sign, exponent and quiet bit: everything except the payload
+)
+
+// NaNWith returns an IEEE 754 quiet NaN whose low 51 bits are taken from payload;
+// NaNWith(1) has the same bit pattern as the value returned by math.NaN.
+func NaNWith(payload uint64) float64 {
+	bits := (payload &^ nanMask) | nanBits
+	return math.Float64frombits(bits)
+}
+
+// NaNPayload extracts the low 51 payload bits of an IEEE 754 quiet NaN.
+// ok is false, and payload zero, when f is anything other than a
+// quiet NaN.
+func NaNPayload(f float64) (payload uint64, ok bool) {
+	bits := math.Float64bits(f)
+	if ok = bits&nanBits == nanBits; ok {
+		payload = bits &^ nanMask
+	}
+	return payload, ok
+}
+
+// ParseWithNA parses s as a float64. When s equals missing it returns
+// (0, 0, nil); otherwise value and err come from strconv.ParseFloat and
+// weight is 1 only if the parse succeeded.
+func ParseWithNA(s, missing string) (value, weight float64, err error) {
+	if s != missing {
+		value, err = strconv.ParseFloat(s, 64)
+		if err == nil {
+			weight = 1
+		}
+	}
+	return value, weight, err
+}
+
+// Round returns x rounded half away from zero to prec decimal places; a
+// negative prec rounds to a power of ten left of the decimal point.
+// Special cases are:
+//
+//	Round(±0) = +0
+//	Round(±Inf) = ±Inf
+//	Round(NaN) = NaN
+func Round(x float64, prec int) float64 {
+	if x == 0 {
+		// Make sure zero is returned
+		// without the negative bit set.
+		return 0
+	}
+	// Fast path: infinities are returned unchanged for every prec (once
+	// Pow10(prec) underflows to 0, Inf*0 would otherwise become NaN), and
+	// so are integers when prec >= 0.
+	if math.IsInf(x, 0) || (prec >= 0 && x == math.Trunc(x)) {
+		return x
+	}
+	pow := math.Pow10(prec)
+	intermed := x * pow
+	if math.IsInf(intermed, 0) {
+		return x // scaling overflowed; x is returned unchanged
+	}
+	x = math.Round(intermed)
+	if x == 0 {
+		return 0 // never -0, and never 0/0 when pow underflowed to 0
+	}
+	return x / pow
+}
+
+// RoundEven returns x rounded half to even at prec decimal places; a
+// negative prec rounds to a power of ten left of the decimal point.
+// Special cases are:
+//
+//	RoundEven(±0) = +0
+//	RoundEven(±Inf) = ±Inf
+//	RoundEven(NaN) = NaN
+func RoundEven(x float64, prec int) float64 {
+	if x == 0 {
+		// Make sure zero is returned
+		// without the negative bit set.
+		return 0
+	}
+	// Fast path: infinities are returned unchanged for every prec (once
+	// Pow10(prec) underflows to 0, Inf*0 would otherwise become NaN), and
+	// so are integers when prec >= 0.
+	if math.IsInf(x, 0) || (prec >= 0 && x == math.Trunc(x)) {
+		return x
+	}
+	pow := math.Pow10(prec)
+	intermed := x * pow
+	if math.IsInf(intermed, 0) {
+		return x // scaling overflowed; x is returned unchanged
+	}
+	x = math.RoundToEven(intermed)
+	if x == 0 {
+		return 0 // never -0, and never 0/0 when pow underflowed to 0
+	}
+	return x / pow
+}
+
+// Same returns true when a == b (so +0 and -0 are the same) or when both a and b are NaN.
+func Same(a, b float64) bool {
+	return a == b || (math.IsNaN(a) && math.IsNaN(b)) // == alone is always false for NaN operands
+}
--- a/vendor/gonum.org/v1/gonum/internal/asm/c128/axpyinc_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/axpyinc_amd64.s
@@ -0,0 +1,134 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+// MOVDDUP X2, X3 (byte-encoded): X3 = { lo(X2), lo(X2) }
+#define MOVDDUP_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xDA
+// MOVDDUP X4, X5
+#define MOVDDUP_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xEC
+// MOVDDUP X6, X7
+#define MOVDDUP_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xFE
+// MOVDDUP X8, X9
+#define MOVDDUP_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC8
+
+// ADDSUBPD X2, X3 (byte-encoded): X3 = { hi(X3) + hi(X2), lo(X3) - lo(X2) }
+#define ADDSUBPD_X2_X3 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA
+// ADDSUBPD X4, X5
+#define ADDSUBPD_X4_X5 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC
+// ADDSUBPD X6, X7
+#define ADDSUBPD_X6_X7 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE
+// ADDSUBPD X8, X9
+#define ADDSUBPD_X8_X9 BYTE $0x66; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8
+
+// func AxpyInc(alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr)
+TEXT ·AxpyInc(SB), NOSPLIT, $0 // y[iy+i*incY] += alpha * x[ix+i*incX] for i in [0, n); lane comments read { high, low }
+	MOVQ   x_base+16(FP), SI // SI = &x
+	MOVQ   y_base+40(FP), DI // DI = &y
+	MOVQ   n+64(FP), CX      // CX = n
+	CMPQ   CX, $0            // if n==0 { return }
+	JE     axpyi_end
+	MOVQ   ix+88(FP), R8     // R8 = ix  // Load the first index
+	SHLQ   $4, R8            // R8 *= sizeof(complex128)
+	MOVQ   iy+96(FP), R9     // R9 = iy
+	SHLQ   $4, R9            // R9 *= sizeof(complex128)
+	LEAQ   (SI)(R8*1), SI    // SI = &(x[ix])
+	LEAQ   (DI)(R9*1), DI    // DI = &(y[iy])
+	MOVQ   DI, DX            // DX = DI      // Separate pointers: DX reads y, DI writes y
+	MOVQ   incX+72(FP), R8   // R8 = incX
+	SHLQ   $4, R8            // R8 *= sizeof(complex128)
+	MOVQ   incY+80(FP), R9   // R9 = incY
+	SHLQ   $4, R9            // R9 *= sizeof(complex128)
+	MOVUPS alpha+0(FP), X0   // X0 = { imag(a), real(a) }
+	MOVAPS X0, X1
+	SHUFPD $0x1, X1, X1      // X1 = { real(a), imag(a) }
+	MOVAPS X0, X10           // Copy X0 and X1 for pipelining
+	MOVAPS X1, X11
+	MOVQ   CX, BX
+	ANDQ   $3, CX            // CX = n % 4
+	SHRQ   $2, BX            // BX = floor( n / 4 )
+	JZ     axpyi_tail        // if BX == 0 { goto axpyi_tail }
+
+axpyi_loop: // do {  (four elements per iteration)
+	MOVUPS (SI), X2       // X_i = { imag(x[i]), real(x[i]) }
+	MOVUPS (SI)(R8*1), X4
+	LEAQ   (SI)(R8*2), SI // SI = &(SI[incX*2])
+	MOVUPS (SI), X6
+	MOVUPS (SI)(R8*1), X8
+
+	// X_(i+1) = { real(x[i]), real(x[i]) }
+	MOVDDUP_X2_X3
+	MOVDDUP_X4_X5
+	MOVDDUP_X6_X7
+	MOVDDUP_X8_X9
+
+	// X_i = { imag(x[i]), imag(x[i]) }
+	SHUFPD $0x3, X2, X2
+	SHUFPD $0x3, X4, X4
+	SHUFPD $0x3, X6, X6
+	SHUFPD $0x3, X8, X8
+
+	// X_i     = { real(a) * imag(x[i]), imag(a) * imag(x[i])  }
+	// X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i])  }
+	MULPD X1, X2
+	MULPD X0, X3
+	MULPD X11, X4
+	MULPD X10, X5
+	MULPD X1, X6
+	MULPD X0, X7
+	MULPD X11, X8
+	MULPD X10, X9
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(a)*real(x[i]) + real(a)*imag(x[i]),
+	//	real(result[i]):  real(a)*real(x[i]) - imag(a)*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+	ADDSUBPD_X4_X5
+	ADDSUBPD_X6_X7
+	ADDSUBPD_X8_X9
+
+	// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
+	ADDPD  (DX), X3       // NOTE(review): ADDPD from memory needs a 16-byte aligned address — confirm y always is
+	ADDPD  (DX)(R9*1), X5
+	LEAQ   (DX)(R9*2), DX // DX = &(DX[incY*2])
+	ADDPD  (DX), X7
+	ADDPD  (DX)(R9*1), X9
+	MOVUPS X3, (DI)       // y[i] = X_(i+1)
+	MOVUPS X5, (DI)(R9*1)
+	LEAQ   (DI)(R9*2), DI
+	MOVUPS X7, (DI)
+	MOVUPS X9, (DI)(R9*1)
+	LEAQ   (SI)(R8*2), SI // SI = &(SI[incX*2])
+	LEAQ   (DX)(R9*2), DX // DX = &(DX[incY*2])
+	LEAQ   (DI)(R9*2), DI // DI = &(DI[incY*2])
+	DECQ   BX
+	JNZ    axpyi_loop     // } while --BX > 0
+	CMPQ   CX, $0         // if CX == 0 { return }
+	JE     axpyi_end
+
+axpyi_tail: // do {  (one element per iteration; y is read and written through DI)
+	MOVUPS (SI), X2     // X_i = { imag(x[i]), real(x[i]) }
+	MOVDDUP_X2_X3       // X_(i+1) = { real(x[i]), real(x[i]) }
+	SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) }
+	MULPD  X1, X2       // X_i     = { real(a) * imag(x[i]), imag(a) * imag(x[i])  }
+	MULPD  X0, X3       // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i])  }
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(a)*real(x[i]) + real(a)*imag(x[i]),
+	//	real(result[i]):  real(a)*real(x[i]) - imag(a)*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+
+	// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
+	ADDPD  (DI), X3
+	MOVUPS X3, (DI)   // y[i] = X_(i+1)
+	ADDQ   R8, SI     // SI = &(SI[incX])
+	ADDQ   R9, DI     // DI = &(DI[incY])
+	LOOP   axpyi_tail // } while --CX > 0
+
+axpyi_end:
+	RET
--- a/vendor/gonum.org/v1/gonum/internal/asm/c128/axpyincto_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/axpyincto_amd64.s
@@ -0,0 +1,141 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+// MOVDDUP X2, X3 (byte-encoded): X3 = { lo(X2), lo(X2) }
+#define MOVDDUP_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xDA
+// MOVDDUP X4, X5
+#define MOVDDUP_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xEC
+// MOVDDUP X6, X7
+#define MOVDDUP_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xFE
+// MOVDDUP X8, X9
+#define MOVDDUP_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC8
+
+// ADDSUBPD X2, X3 (byte-encoded): X3 = { hi(X3) + hi(X2), lo(X3) - lo(X2) }
+#define ADDSUBPD_X2_X3 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA
+// ADDSUBPD X4, X5
+#define ADDSUBPD_X4_X5 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC
+// ADDSUBPD X6, X7
+#define ADDSUBPD_X6_X7 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE
+// ADDSUBPD X8, X9
+#define ADDSUBPD_X8_X9 BYTE $0x66; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8
+
+// func AxpyIncTo(dst []complex128, incDst, idst uintptr, alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr)
+TEXT ·AxpyIncTo(SB), NOSPLIT, $0 // dst[idst+i*incDst] = alpha * x[ix+i*incX] + y[iy+i*incY] for i in [0, n); lane comments read { high, low }
+	MOVQ   dst_base+0(FP), DI // DI = &dst
+	MOVQ   x_base+56(FP), SI  // SI = &x
+	MOVQ   y_base+80(FP), DX  // DX = &y
+	MOVQ   n+104(FP), CX      // CX = n
+	CMPQ   CX, $0             // if n==0 { return }
+	JE     axpyi_end
+	MOVQ   ix+128(FP), R8     // R8 = ix  // Load the first index
+	SHLQ   $4, R8             // R8 *= sizeof(complex128)
+	MOVQ   iy+136(FP), R9     // R9 = iy
+	SHLQ   $4, R9             // R9 *= sizeof(complex128)
+	MOVQ   idst+32(FP), R10   // R10 = idst
+	SHLQ   $4, R10            // R10 *= sizeof(complex128)
+	LEAQ   (SI)(R8*1), SI     // SI = &(x[ix])
+	LEAQ   (DX)(R9*1), DX     // DX = &(y[iy])
+	LEAQ   (DI)(R10*1), DI    // DI = &(dst[idst])
+	MOVQ   incX+112(FP), R8   // R8 = incX
+	SHLQ   $4, R8             // R8 *= sizeof(complex128)
+	MOVQ   incY+120(FP), R9   // R9 = incY
+	SHLQ   $4, R9             // R9 *= sizeof(complex128)
+	MOVQ   incDst+24(FP), R10 // R10 = incDst
+	SHLQ   $4, R10            // R10 *= sizeof(complex128)
+	MOVUPS alpha+40(FP), X0   // X0 = { imag(a), real(a) }
+	MOVAPS X0, X1
+	SHUFPD $0x1, X1, X1       // X1 = { real(a), imag(a) }
+	MOVAPS X0, X10            // Copy X0 and X1 for pipelining
+	MOVAPS X1, X11
+	MOVQ   CX, BX
+	ANDQ   $3, CX             // CX = n % 4
+	SHRQ   $2, BX             // BX = floor( n / 4 )
+	JZ     axpyi_tail         // if BX == 0 { goto axpyi_tail }
+
+axpyi_loop: // do {  (four elements per iteration)
+	MOVUPS (SI), X2       // X_i = { imag(x[i]), real(x[i]) }
+	MOVUPS (SI)(R8*1), X4
+	LEAQ   (SI)(R8*2), SI // SI = &(SI[incX*2])
+
+	MOVUPS (SI), X6
+	MOVUPS (SI)(R8*1), X8
+
+	// X_(i+1) = { real(x[i]), real(x[i]) }
+	MOVDDUP_X2_X3
+	MOVDDUP_X4_X5
+	MOVDDUP_X6_X7
+	MOVDDUP_X8_X9
+
+	// X_i = { imag(x[i]), imag(x[i]) }
+	SHUFPD $0x3, X2, X2
+	SHUFPD $0x3, X4, X4
+	SHUFPD $0x3, X6, X6
+	SHUFPD $0x3, X8, X8
+
+	// X_i     = { real(a) * imag(x[i]), imag(a) * imag(x[i])  }
+	// X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i])  }
+	MULPD X1, X2
+	MULPD X0, X3
+	MULPD X11, X4
+	MULPD X10, X5
+	MULPD X1, X6
+	MULPD X0, X7
+	MULPD X11, X8
+	MULPD X10, X9
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(a)*real(x[i]) + real(a)*imag(x[i]),
+	//	real(result[i]):  real(a)*real(x[i]) - imag(a)*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+	ADDSUBPD_X4_X5
+	ADDSUBPD_X6_X7
+	ADDSUBPD_X8_X9
+
+	// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
+	ADDPD  (DX), X3        // NOTE(review): ADDPD from memory needs a 16-byte aligned address — confirm y always is
+	ADDPD  (DX)(R9*1), X5
+	LEAQ   (DX)(R9*2), DX  // DX = &(DX[incY*2])
+	ADDPD  (DX), X7
+	ADDPD  (DX)(R9*1), X9
+	MOVUPS X3, (DI)        // dst[i] = X_(i+1)
+	MOVUPS X5, (DI)(R10*1)
+	LEAQ   (DI)(R10*2), DI
+	MOVUPS X7, (DI)
+	MOVUPS X9, (DI)(R10*1)
+	LEAQ   (SI)(R8*2), SI  // SI = &(SI[incX*2])
+	LEAQ   (DX)(R9*2), DX  // DX = &(DX[incY*2])
+	LEAQ   (DI)(R10*2), DI // DI = &(DI[incDst*2])
+	DECQ   BX
+	JNZ    axpyi_loop      // } while --BX > 0
+	CMPQ   CX, $0          // if CX == 0 { return }
+	JE     axpyi_end
+
+axpyi_tail: // do {  (one element per iteration)
+	MOVUPS (SI), X2     // X_i = { imag(x[i]), real(x[i]) }
+	MOVDDUP_X2_X3       // X_(i+1) = { real(x[i]), real(x[i]) }
+	SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) }
+	MULPD  X1, X2       // X_i     = { real(a) * imag(x[i]), imag(a) * imag(x[i])  }
+	MULPD  X0, X3       // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i])  }
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(a)*real(x[i]) + real(a)*imag(x[i]),
+	//	real(result[i]):  real(a)*real(x[i]) - imag(a)*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+
+	// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
+	ADDPD  (DX), X3
+	MOVUPS X3, (DI)   // dst[i] = X_(i+1)
+	ADDQ   R8, SI     // SI += incX
+	ADDQ   R9, DX     // DX += incY
+	ADDQ   R10, DI    // DI += incDst
+	LOOP   axpyi_tail // } while --CX > 0
+
+axpyi_end:
+	RET
--- a/vendor/gonum.org/v1/gonum/internal/asm/c128/axpyunitary_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/axpyunitary_amd64.s
@@ -0,0 +1,122 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+// MOVDDUP X2, X3
+#define MOVDDUP_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xDA
+// MOVDDUP X4, X5
+#define MOVDDUP_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xEC
+// MOVDDUP X6, X7
+#define MOVDDUP_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xFE
+// MOVDDUP X8, X9
+#define MOVDDUP_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC8
+
+// ADDSUBPD X2, X3
+#define ADDSUBPD_X2_X3 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA
+// ADDSUBPD X4, X5
+#define ADDSUBPD_X4_X5 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC
+// ADDSUBPD X6, X7
+#define ADDSUBPD_X6_X7 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE
+// ADDSUBPD X8, X9
+#define ADDSUBPD_X8_X9 BYTE $0x66; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8
+
+// func AxpyUnitary(alpha complex128, x, y []complex128)
+TEXT ·AxpyUnitary(SB), NOSPLIT, $0
+	MOVQ    x_base+16(FP), SI // SI = &x
+	MOVQ    y_base+40(FP), DI // DI = &y
+	MOVQ    x_len+24(FP), CX  // CX = min( len(x), len(y) )
+	CMPQ    y_len+48(FP), CX
+	CMOVQLE y_len+48(FP), CX
+	CMPQ    CX, $0            // if CX == 0 { return }
+	JE      caxy_end
+	PXOR    X0, X0            // Clear work registers and cache-align loop
+	PXOR    X1, X1
+	MOVUPS  alpha+0(FP), X0   // X0 = { imag(a), real(a) }
+	MOVAPS  X0, X1
+	SHUFPD  $0x1, X1, X1      // X1 = { real(a), imag(a) }
+	XORQ    AX, AX            // i = 0
+	MOVAPS  X0, X10           // Copy X0 and X1 for pipelining
+	MOVAPS  X1, X11
+	MOVQ    CX, BX
+	ANDQ    $3, CX            // CX = n % 4
+	SHRQ    $2, BX            // BX = floor( n / 4 )
+	JZ      caxy_tail         // if BX == 0 { goto caxy_tail }
+
+caxy_loop: // do {
+	MOVUPS (SI)(AX*8), X2   // X_i = { imag(x[i]), real(x[i]) }
+	MOVUPS 16(SI)(AX*8), X4
+	MOVUPS 32(SI)(AX*8), X6
+	MOVUPS 48(SI)(AX*8), X8
+
+	// X_(i+1) = { real(x[i], real(x[i]) }
+	MOVDDUP_X2_X3
+	MOVDDUP_X4_X5
+	MOVDDUP_X6_X7
+	MOVDDUP_X8_X9
+
+	// X_i = { imag(x[i]), imag(x[i]) }
+	SHUFPD $0x3, X2, X2
+	SHUFPD $0x3, X4, X4
+	SHUFPD $0x3, X6, X6
+	SHUFPD $0x3, X8, X8
+
+	// X_i     = { real(a) * imag(x[i]), imag(a) * imag(x[i])  }
+	// X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i])  }
+	MULPD X1, X2
+	MULPD X0, X3
+	MULPD X11, X4
+	MULPD X10, X5
+	MULPD X1, X6
+	MULPD X0, X7
+	MULPD X11, X8
+	MULPD X10, X9
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(a)*real(x[i]) + real(a)*imag(x[i]),
+	//	real(result[i]):  real(a)*real(x[i]) - imag(a)*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+	ADDSUBPD_X4_X5
+	ADDSUBPD_X6_X7
+	ADDSUBPD_X8_X9
+
+	// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
+	ADDPD  (DI)(AX*8), X3
+	ADDPD  16(DI)(AX*8), X5
+	ADDPD  32(DI)(AX*8), X7
+	ADDPD  48(DI)(AX*8), X9
+	MOVUPS X3, (DI)(AX*8)   // y[i] = X_(i+1)
+	MOVUPS X5, 16(DI)(AX*8)
+	MOVUPS X7, 32(DI)(AX*8)
+	MOVUPS X9, 48(DI)(AX*8)
+	ADDQ   $8, AX           // i += 8
+	DECQ   BX
+	JNZ    caxy_loop        // } while --BX > 0
+	CMPQ   CX, $0           // if CX == 0 { return }
+	JE     caxy_end
+
+caxy_tail: // do {
+	MOVUPS (SI)(AX*8), X2 // X_i = { imag(x[i]), real(x[i]) }
+	MOVDDUP_X2_X3         // X_(i+1) = { real(x[i], real(x[i]) }
+	SHUFPD $0x3, X2, X2   // X_i = { imag(x[i]), imag(x[i]) }
+	MULPD  X1, X2         // X_i     = { real(a) * imag(x[i]), imag(a) * imag(x[i])  }
+	MULPD  X0, X3         // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i])  }
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(a)*real(x[i]) + real(a)*imag(x[i]),
+	//	real(result[i]):  real(a)*real(x[i]) - imag(a)*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+
+	// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
+	ADDPD  (DI)(AX*8), X3
+	MOVUPS X3, (DI)(AX*8) // y[i] = X_(i+1)
+	ADDQ   $2, AX         // i += 2
+	LOOP   caxy_tail      // }  while --CX > 0
+
+caxy_end:
+	RET
--- a/vendor/gonum.org/v1/gonum/internal/asm/c128/axpyunitaryto_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/axpyunitaryto_amd64.s
@@ -0,0 +1,123 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+// MOVDDUP X2, X3
+#define MOVDDUP_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xDA
+// MOVDDUP X4, X5
+#define MOVDDUP_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xEC
+// MOVDDUP X6, X7
+#define MOVDDUP_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xFE
+// MOVDDUP X8, X9
+#define MOVDDUP_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC8
+
+// ADDSUBPD X2, X3
+#define ADDSUBPD_X2_X3 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA
+// ADDSUBPD X4, X5
+#define ADDSUBPD_X4_X5 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC
+// ADDSUBPD X6, X7
+#define ADDSUBPD_X6_X7 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE
+// ADDSUBPD X8, X9
+#define ADDSUBPD_X8_X9 BYTE $0x66; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8
+
+// func AxpyUnitaryTo(dst []complex128, alpha complex64, x, y []complex128)
+TEXT ·AxpyUnitaryTo(SB), NOSPLIT, $0
+	MOVQ    dst_base+0(FP), DI // DI = &dst
+	MOVQ    x_base+40(FP), SI  // SI = &x
+	MOVQ    y_base+64(FP), DX  // DX = &y
+	MOVQ    x_len+48(FP), CX   // CX = min( len(x), len(y), len(dst) )
+	CMPQ    y_len+72(FP), CX
+	CMOVQLE y_len+72(FP), CX
+	CMPQ    dst_len+8(FP), CX
+	CMOVQLE dst_len+8(FP), CX
+	CMPQ    CX, $0             // if CX == 0 { return }
+	JE      caxy_end
+	MOVUPS  alpha+24(FP), X0   // X0 = { imag(a), real(a) }
+	MOVAPS  X0, X1
+	SHUFPD  $0x1, X1, X1       // X1 = { real(a), imag(a) }
+	XORQ    AX, AX             // i = 0
+	MOVAPS  X0, X10            // Copy X0 and X1 for pipelining
+	MOVAPS  X1, X11
+	MOVQ    CX, BX
+	ANDQ    $3, CX             // CX = n % 4
+	SHRQ    $2, BX             // BX = floor( n / 4 )
+	JZ      caxy_tail          // if BX == 0 { goto caxy_tail }
+
+caxy_loop: // do {
+	MOVUPS (SI)(AX*8), X2   // X_i = { imag(x[i]), real(x[i]) }
+	MOVUPS 16(SI)(AX*8), X4
+	MOVUPS 32(SI)(AX*8), X6
+	MOVUPS 48(SI)(AX*8), X8
+
+	// X_(i+1) = { real(x[i], real(x[i]) }
+	MOVDDUP_X2_X3 // Load and duplicate imag elements (xi, xi)
+	MOVDDUP_X4_X5
+	MOVDDUP_X6_X7
+	MOVDDUP_X8_X9
+
+	// X_i = { imag(x[i]), imag(x[i]) }
+	SHUFPD $0x3, X2, X2 // duplicate real elements (xr, xr)
+	SHUFPD $0x3, X4, X4
+	SHUFPD $0x3, X6, X6
+	SHUFPD $0x3, X8, X8
+
+	// X_i     = { real(a) * imag(x[i]), imag(a) * imag(x[i])  }
+	// X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i])  }
+	MULPD X1, X2
+	MULPD X0, X3
+	MULPD X11, X4
+	MULPD X10, X5
+	MULPD X1, X6
+	MULPD X0, X7
+	MULPD X11, X8
+	MULPD X10, X9
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(a)*real(x[i]) + real(a)*imag(x[i]),
+	//	real(result[i]):  real(a)*real(x[i]) - imag(a)*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+	ADDSUBPD_X4_X5
+	ADDSUBPD_X6_X7
+	ADDSUBPD_X8_X9
+
+	// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
+	ADDPD  (DX)(AX*8), X3
+	ADDPD  16(DX)(AX*8), X5
+	ADDPD  32(DX)(AX*8), X7
+	ADDPD  48(DX)(AX*8), X9
+	MOVUPS X3, (DI)(AX*8)   // y[i] = X_(i+1)
+	MOVUPS X5, 16(DI)(AX*8)
+	MOVUPS X7, 32(DI)(AX*8)
+	MOVUPS X9, 48(DI)(AX*8)
+	ADDQ   $8, AX           // i += 8
+	DECQ   BX
+	JNZ    caxy_loop        // } while --BX > 0
+	CMPQ   CX, $0           // if CX == 0 { return }
+	JE     caxy_end
+
+caxy_tail: // Same calculation, but read in values to avoid trampling memory
+	MOVUPS (SI)(AX*8), X2 // X_i = { imag(x[i]), real(x[i]) }
+	MOVDDUP_X2_X3         // X_(i+1) = { real(x[i], real(x[i]) }
+	SHUFPD $0x3, X2, X2   // X_i = { imag(x[i]), imag(x[i]) }
+	MULPD  X1, X2         // X_i     = { real(a) * imag(x[i]), imag(a) * imag(x[i])  }
+	MULPD  X0, X3         // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i])  }
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(a)*real(x[i]) + real(a)*imag(x[i]),
+	//	real(result[i]):  real(a)*real(x[i]) - imag(a)*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+
+	// X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
+	ADDPD  (DX)(AX*8), X3
+	MOVUPS X3, (DI)(AX*8) // y[i] = X_(i+1)
+	ADDQ   $2, AX         // i += 2
+	LOOP   caxy_tail      // }  while --CX > 0
+
+caxy_end:
+	RET
--- a/vendor/gonum.org/v1/gonum/internal/asm/c128/doc.go
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/doc.go
@@ -0,0 +1,6 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package c128 provides complex128 vector primitives.
+package c128 // import "gonum.org/v1/gonum/internal/asm/c128"
--- a/vendor/gonum.org/v1/gonum/internal/asm/c128/dotcinc_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/dotcinc_amd64.s
@@ -0,0 +1,153 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define MOVDDUP_XPTR__X3    LONG $0x1E120FF2 // MOVDDUP (SI), X3
+#define MOVDDUP_XPTR_INCX__X5    LONG $0x120F42F2; WORD $0x062C // MOVDDUP (SI)(R8*1), X5
+#define MOVDDUP_XPTR_INCX_2__X7    LONG $0x120F42F2; WORD $0x463C // MOVDDUP (SI)(R8*2), X7
+#define MOVDDUP_XPTR_INCx3X__X9    LONG $0x120F46F2; WORD $0x0E0C // MOVDDUP (SI)(R9*1), X9
+
+#define MOVDDUP_8_XPTR__X2    LONG $0x56120FF2; BYTE $0x08 // MOVDDUP 8(SI), X2
+#define MOVDDUP_8_XPTR_INCX__X4    LONG $0x120F42F2; WORD $0x0664; BYTE $0x08 // MOVDDUP 8(SI)(R8*1), X4
+#define MOVDDUP_8_XPTR_INCX_2__X6    LONG $0x120F42F2; WORD $0x4674; BYTE $0x08 // MOVDDUP 8(SI)(R8*2), X6
+#define MOVDDUP_8_XPTR_INCx3X__X8    LONG $0x120F46F2; WORD $0x0E44; BYTE $0x08 // MOVDDUP 8(SI)(R9*1), X8
+
+#define ADDSUBPD_X2_X3    LONG $0xDAD00F66 // ADDSUBPD X2, X3
+#define ADDSUBPD_X4_X5    LONG $0xECD00F66 // ADDSUBPD X4, X5
+#define ADDSUBPD_X6_X7    LONG $0xFED00F66 // ADDSUBPD X6, X7
+#define ADDSUBPD_X8_X9    LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9
+
+#define X_PTR SI
+#define Y_PTR DI
+#define LEN CX
+#define TAIL BX
+#define SUM X0
+#define P_SUM X1
+#define INC_X R8
+#define INCx3_X R9
+#define INC_Y R10
+#define INCx3_Y R11
+#define NEG1 X15
+#define P_NEG1 X14
+
+// func DotcInc(x, y []complex128, n, incX, incY, ix, iy uintptr) (sum complex128)
+TEXT ·DotcInc(SB), NOSPLIT, $0 // sum = Σ conj(x[ix+i*incX]) * y[iy+i*incY] for i in [0, n); lane comments read { high, low }
+	MOVQ   x_base+0(FP), X_PTR       // X_PTR = &x
+	MOVQ   y_base+24(FP), Y_PTR      // Y_PTR = &y
+	MOVQ   n+48(FP), LEN             // LEN = n
+	PXOR   SUM, SUM                  // SUM = 0  (zeroed before the n == 0 exit so that path stores 0)
+	CMPQ   LEN, $0                   // if LEN == 0 { return }
+	JE     dot_end
+	PXOR   P_SUM, P_SUM              // P_SUM = 0
+	MOVQ   ix+72(FP), INC_X          // INC_X = ix * sizeof(complex128)
+	SHLQ   $4, INC_X
+	MOVQ   iy+80(FP), INC_Y          // INC_Y = iy * sizeof(complex128)
+	SHLQ   $4, INC_Y
+	LEAQ   (X_PTR)(INC_X*1), X_PTR   // X_PTR = &(X_PTR[ix])
+	LEAQ   (Y_PTR)(INC_Y*1), Y_PTR   // Y_PTR = &(Y_PTR[iy])
+	MOVQ   incX+56(FP), INC_X        // INC_X = incX
+	SHLQ   $4, INC_X                 // INC_X *=  sizeof(complex128)
+	MOVQ   incY+64(FP), INC_Y        // INC_Y = incY
+	SHLQ   $4, INC_Y                 // INC_Y *=  sizeof(complex128)
+	MOVSD  $(-1.0), NEG1
+	SHUFPD $0, NEG1, NEG1            // { -1, -1 }
+	MOVQ   LEN, TAIL
+	ANDQ   $3, TAIL                  // TAIL = n % 4
+	SHRQ   $2, LEN                   // LEN = floor( n / 4 )
+	JZ     dot_tail                  // if n < 4 { goto dot_tail }
+	MOVAPS NEG1, P_NEG1              // Copy NEG1 to P_NEG1 for pipelining
+	LEAQ   (INC_X)(INC_X*2), INCx3_X // INCx3_X = 3 * incX * sizeof(complex128)
+	LEAQ   (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = 3 * incY * sizeof(complex128)
+
+dot_loop: // do {  (four elements per iteration)
+	MOVDDUP_XPTR__X3        // X_(i+1) = { real(x[i]), real(x[i]) }
+	MOVDDUP_XPTR_INCX__X5
+	MOVDDUP_XPTR_INCX_2__X7
+	MOVDDUP_XPTR_INCx3X__X9
+
+	MOVDDUP_8_XPTR__X2        // X_i = { imag(x[i]), imag(x[i]) }
+	MOVDDUP_8_XPTR_INCX__X4
+	MOVDDUP_8_XPTR_INCX_2__X6
+	MOVDDUP_8_XPTR_INCx3X__X8
+
+	// X_i = { -imag(x[i]), -imag(x[i]) }  (negation conjugates x)
+	MULPD NEG1, X2
+	MULPD P_NEG1, X4
+	MULPD NEG1, X6
+	MULPD P_NEG1, X8
+
+	// X_j = { imag(y[i]), real(y[i]) }
+	MOVUPS (Y_PTR), X10
+	MOVUPS (Y_PTR)(INC_Y*1), X11
+	MOVUPS (Y_PTR)(INC_Y*2), X12
+	MOVUPS (Y_PTR)(INCx3_Y*1), X13
+
+	// X_(i+1) = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i])  }
+	MULPD X10, X3
+	MULPD X11, X5
+	MULPD X12, X7
+	MULPD X13, X9
+
+	// X_j     = { real(y[i]), imag(y[i]) }
+	SHUFPD $0x1, X10, X10
+	SHUFPD $0x1, X11, X11
+	SHUFPD $0x1, X12, X12
+	SHUFPD $0x1, X13, X13
+
+	// X_i     = { real(y[i]) * -imag(x[i]), imag(y[i]) * -imag(x[i])  }
+	MULPD X10, X2
+	MULPD X11, X4
+	MULPD X12, X6
+	MULPD X13, X8
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(y[i])*real(x[i]) - real(y[i])*imag(x[i]),
+	//	real(result[i]):  real(y[i])*real(x[i]) + imag(y[i])*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+	ADDSUBPD_X4_X5
+	ADDSUBPD_X6_X7
+	ADDSUBPD_X8_X9
+
+	// sum and psum accumulate alternate results
+	ADDPD X3, SUM
+	ADDPD X5, P_SUM
+	ADDPD X7, SUM
+	ADDPD X9, P_SUM
+
+	LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4])
+	LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[incY*4])
+
+	DECQ  LEN
+	JNZ   dot_loop   // } while --LEN > 0
+	ADDPD P_SUM, SUM // sum += psum
+	CMPQ  TAIL, $0   // if TAIL == 0 { return }
+	JE    dot_end
+
+dot_tail: // do {  (one element per iteration)
+	MOVDDUP_XPTR__X3      // X_(i+1) = { real(x[i]), real(x[i]) }
+	MOVDDUP_8_XPTR__X2    // X_i = { imag(x[i]), imag(x[i]) }
+	MULPD  NEG1, X2       // X_i     = { -imag(x[i])          , -imag(x[i])           }
+	MOVUPS (Y_PTR), X10   // X_j     = {  imag(y[i])          ,  real(y[i])           }
+	MULPD  X10, X3        // X_(i+1) = {  imag(y[i]) * real(x[i]),  real(y[i]) * real(x[i]) }
+	SHUFPD $0x1, X10, X10 // X_j     = {  real(y[i])          ,  imag(y[i])           }
+	MULPD  X10, X2        // X_i     = {  real(y[i]) * -imag(x[i]),  imag(y[i]) * -imag(x[i]) }
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(y[i])*real(x[i]) - real(y[i])*imag(x[i]),
+	//	real(result[i]):  real(y[i])*real(x[i]) + imag(y[i])*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+	ADDPD X3, SUM      // sum += result[i]
+	ADDQ  INC_X, X_PTR // X_PTR += incX
+	ADDQ  INC_Y, Y_PTR // Y_PTR += incY
+	DECQ  TAIL
+	JNZ   dot_tail     // }  while --TAIL > 0
+
+dot_end:
+	MOVUPS SUM, sum+88(FP) // store the result: low lane = real(sum), high lane = imag(sum)
+	RET
--- a/vendor/gonum.org/v1/gonum/internal/asm/c128/dotcunitary_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/dotcunitary_amd64.s
@@ -0,0 +1,143 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define MOVDDUP_XPTR_IDX_8__X3    LONG $0x1C120FF2; BYTE $0xC6 // MOVDDUP (SI)(AX*8), X3
+#define MOVDDUP_16_XPTR_IDX_8__X5    LONG $0x6C120FF2; WORD $0x10C6 // MOVDDUP 16(SI)(AX*8), X5
+#define MOVDDUP_32_XPTR_IDX_8__X7    LONG $0x7C120FF2; WORD $0x20C6 // MOVDDUP 32(SI)(AX*8), X7
+#define MOVDDUP_48_XPTR_IDX_8__X9    LONG $0x120F44F2; WORD $0xC64C; BYTE $0x30 // MOVDDUP 48(SI)(AX*8), X9
+
+#define MOVDDUP_XPTR_IIDX_8__X2    LONG $0x14120FF2; BYTE $0xD6 // MOVDDUP (SI)(DX*8), X2
+#define MOVDDUP_16_XPTR_IIDX_8__X4    LONG $0x64120FF2; WORD $0x10D6 // MOVDDUP 16(SI)(DX*8), X4
+#define MOVDDUP_32_XPTR_IIDX_8__X6    LONG $0x74120FF2; WORD $0x20D6 // MOVDDUP 32(SI)(DX*8), X6
+#define MOVDDUP_48_XPTR_IIDX_8__X8    LONG $0x120F44F2; WORD $0xD644; BYTE $0x30 // MOVDDUP 48(SI)(DX*8), X8
+
+#define ADDSUBPD_X2_X3    LONG $0xDAD00F66 // ADDSUBPD X2, X3
+#define ADDSUBPD_X4_X5    LONG $0xECD00F66 // ADDSUBPD X4, X5
+#define ADDSUBPD_X6_X7    LONG $0xFED00F66 // ADDSUBPD X6, X7
+#define ADDSUBPD_X8_X9    LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9
+
+#define X_PTR SI
+#define Y_PTR DI
+#define LEN CX
+#define TAIL BX
+#define SUM X0
+#define P_SUM X1
+#define IDX AX
+#define I_IDX DX
+#define NEG1 X15
+#define P_NEG1 X14
+
+// func DotcUnitary(x, y []complex128) (sum complex128)
+TEXT ·DotcUnitary(SB), NOSPLIT, $0 // sum = Σ conj(x[i]) * y[i] for i in [0, min(len(x), len(y))); lane comments read { high, low }
+	MOVQ    x_base+0(FP), X_PTR  // X_PTR = &x
+	MOVQ    y_base+24(FP), Y_PTR // Y_PTR = &y
+	MOVQ    x_len+8(FP), LEN     // LEN = min( len(x), len(y) )
+	CMPQ    y_len+32(FP), LEN
+	CMOVQLE y_len+32(FP), LEN
+	PXOR    SUM, SUM             // sum = 0  (zeroed before the LEN == 0 exit so that path stores 0)
+	CMPQ    LEN, $0              // if LEN == 0 { return }
+	JE      dot_end
+	XORPS   P_SUM, P_SUM         // psum = 0
+	MOVSD   $(-1.0), NEG1
+	SHUFPD  $0, NEG1, NEG1       // { -1, -1 }
+	XORQ    IDX, IDX             // i := 0  (float64 slot of real(x[i]))
+	MOVQ    $1, I_IDX            // j := 1  (float64 slot of imag(x[i]))
+	MOVQ    LEN, TAIL
+	ANDQ    $3, TAIL             // TAIL = n % 4
+	SHRQ    $2, LEN              // LEN = floor( n / 4 )
+	JZ      dot_tail             // if LEN == 0 { goto dot_tail }
+
+	MOVAPS NEG1, P_NEG1 // Copy NEG1 to P_NEG1 for pipelining
+
+dot_loop: // do {  (four elements per iteration)
+	MOVDDUP_XPTR_IDX_8__X3    // X_(i+1) = { real(x[i]), real(x[i]) }
+	MOVDDUP_16_XPTR_IDX_8__X5
+	MOVDDUP_32_XPTR_IDX_8__X7
+	MOVDDUP_48_XPTR_IDX_8__X9
+
+	MOVDDUP_XPTR_IIDX_8__X2    // X_i = { imag(x[i]), imag(x[i]) }
+	MOVDDUP_16_XPTR_IIDX_8__X4
+	MOVDDUP_32_XPTR_IIDX_8__X6
+	MOVDDUP_48_XPTR_IIDX_8__X8
+
+	// X_i = { -imag(x[i]), -imag(x[i]) }  (negation conjugates x)
+	MULPD NEG1, X2
+	MULPD P_NEG1, X4
+	MULPD NEG1, X6
+	MULPD P_NEG1, X8
+
+	// X_j = { imag(y[i]), real(y[i]) }
+	MOVUPS (Y_PTR)(IDX*8), X10
+	MOVUPS 16(Y_PTR)(IDX*8), X11
+	MOVUPS 32(Y_PTR)(IDX*8), X12
+	MOVUPS 48(Y_PTR)(IDX*8), X13
+
+	// X_(i+1) = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i])  }
+	MULPD X10, X3
+	MULPD X11, X5
+	MULPD X12, X7
+	MULPD X13, X9
+
+	// X_j = { real(y[i]), imag(y[i]) }
+	SHUFPD $0x1, X10, X10
+	SHUFPD $0x1, X11, X11
+	SHUFPD $0x1, X12, X12
+	SHUFPD $0x1, X13, X13
+
+	// X_i     = { real(y[i]) * -imag(x[i]), imag(y[i]) * -imag(x[i])  }
+	MULPD X10, X2
+	MULPD X11, X4
+	MULPD X12, X6
+	MULPD X13, X8
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(y[i])*real(x[i]) - real(y[i])*imag(x[i]),
+	//	real(result[i]):  real(y[i])*real(x[i]) + imag(y[i])*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+	ADDSUBPD_X4_X5
+	ADDSUBPD_X6_X7
+	ADDSUBPD_X8_X9
+
+	// sum and psum accumulate alternate results
+	ADDPD X3, SUM
+	ADDPD X5, P_SUM
+	ADDPD X7, SUM
+	ADDPD X9, P_SUM
+
+	ADDQ  $8, IDX    // IDX += 8  (four complex128 values)
+	ADDQ  $8, I_IDX  // I_IDX += 8
+	DECQ  LEN
+	JNZ   dot_loop   // } while --LEN > 0
+	ADDPD P_SUM, SUM // sum += psum
+	CMPQ  TAIL, $0   // if TAIL == 0 { return }
+	JE    dot_end
+
+dot_tail: // do {  (one element per iteration)
+	MOVDDUP_XPTR_IDX_8__X3     // X_(i+1) = {  real(x[i])          ,  real(x[i])           }
+	MOVDDUP_XPTR_IIDX_8__X2    // X_i     = {  imag(x[i])          ,  imag(x[i])           }
+	MULPD  NEG1, X2            // X_i     = { -imag(x[i])          , -imag(x[i])           }
+	MOVUPS (Y_PTR)(IDX*8), X10 // X_j     = {  imag(y[i])          ,  real(y[i])           }
+	MULPD  X10, X3             // X_(i+1) = {  imag(y[i]) * real(x[i]),  real(y[i]) * real(x[i]) }
+	SHUFPD $0x1, X10, X10      // X_j     = {  real(y[i])          ,  imag(y[i])           }
+	MULPD  X10, X2             // X_i     = {  real(y[i]) * -imag(x[i]),  imag(y[i]) * -imag(x[i]) }
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(y[i])*real(x[i]) - real(y[i])*imag(x[i]),
+	//	real(result[i]):  real(y[i])*real(x[i]) + imag(y[i])*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+	ADDPD X3, SUM   // SUM += result[i]
+	ADDQ  $2, IDX   // IDX += 2  (one complex128 value)
+	ADDQ  $2, I_IDX // I_IDX += 2
+	DECQ  TAIL
+	JNZ   dot_tail  // }  while --TAIL > 0
+
+dot_end:
+	MOVUPS SUM, sum+48(FP) // store the result: low lane = real(sum), high lane = imag(sum)
+	RET
--- a/vendor/gonum.org/v1/gonum/internal/asm/c128/dotuinc_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/dotuinc_amd64.s
@@ -0,0 +1,141 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define MOVDDUP_XPTR__X3    LONG $0x1E120FF2 // MOVDDUP (SI), X3
+#define MOVDDUP_XPTR_INCX__X5    LONG $0x120F42F2; WORD $0x062C // MOVDDUP (SI)(R8*1), X5
+#define MOVDDUP_XPTR_INCX_2__X7    LONG $0x120F42F2; WORD $0x463C // MOVDDUP (SI)(R8*2), X7
+#define MOVDDUP_XPTR_INCx3X__X9    LONG $0x120F46F2; WORD $0x0E0C // MOVDDUP (SI)(R9*1), X9
+
+#define MOVDDUP_8_XPTR__X2    LONG $0x56120FF2; BYTE $0x08 // MOVDDUP 8(SI), X2
+#define MOVDDUP_8_XPTR_INCX__X4    LONG $0x120F42F2; WORD $0x0664; BYTE $0x08 // MOVDDUP 8(SI)(R8*1), X4
+#define MOVDDUP_8_XPTR_INCX_2__X6    LONG $0x120F42F2; WORD $0x4674; BYTE $0x08 // MOVDDUP 8(SI)(R8*2), X6
+#define MOVDDUP_8_XPTR_INCx3X__X8    LONG $0x120F46F2; WORD $0x0E44; BYTE $0x08 // MOVDDUP 8(SI)(R9*1), X8
+
+#define ADDSUBPD_X2_X3    LONG $0xDAD00F66 // ADDSUBPD X2, X3
+#define ADDSUBPD_X4_X5    LONG $0xECD00F66 // ADDSUBPD X4, X5
+#define ADDSUBPD_X6_X7    LONG $0xFED00F66 // ADDSUBPD X6, X7
+#define ADDSUBPD_X8_X9    LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9
+
+#define X_PTR SI
+#define Y_PTR DI
+#define LEN CX
+#define TAIL BX
+#define SUM X0
+#define P_SUM X1
+#define INC_X R8
+#define INCx3_X R9
+#define INC_Y R10
+#define INCx3_Y R11
+
+// func DotuInc(x, y []complex128, n, incX, incY, ix, iy uintptr) (sum complex128)
+TEXT ·DotuInc(SB), NOSPLIT, $0
+	MOVQ x_base+0(FP), X_PTR       // X_PTR = &x
+	MOVQ y_base+24(FP), Y_PTR      // Y_PTR = &y
+	MOVQ n+48(FP), LEN             // LEN = n
+	PXOR SUM, SUM                  // sum = 0
+	CMPQ LEN, $0                   // if LEN == 0 { return }
+	JE   dot_end
+	MOVQ ix+72(FP), INC_X          // INC_X = ix * sizeof(complex128) (temporary start offset)
+	SHLQ $4, INC_X
+	MOVQ iy+80(FP), INC_Y          // INC_Y = iy * sizeof(complex128) (temporary start offset)
+	SHLQ $4, INC_Y
+	LEAQ (X_PTR)(INC_X*1), X_PTR   // X_PTR = &(X_PTR[ix])
+	LEAQ (Y_PTR)(INC_Y*1), Y_PTR   // Y_PTR = &(Y_PTR[iy])
+	MOVQ incX+56(FP), INC_X        // INC_X = incX
+	SHLQ $4, INC_X                 // INC_X *=  sizeof(complex128)
+	MOVQ incY+64(FP), INC_Y        // INC_Y = incY
+	SHLQ $4, INC_Y                 // INC_Y *=  sizeof(complex128)
+	MOVQ LEN, TAIL
+	ANDQ $3, TAIL                  // TAIL = LEN % 4
+	SHRQ $2, LEN                   // LEN = floor( LEN / 4 )
+	JZ   dot_tail                  // if LEN == 0 { goto dot_tail } (TAIL != 0 here because n != 0)
+	PXOR P_SUM, P_SUM              // psum = 0
+	LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = 3 * incX * sizeof(complex128)
+	LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = 3 * incY * sizeof(complex128)
+
+dot_loop: // do { (four elements per iteration)
+	MOVDDUP_XPTR__X3        // X_(i+1) = { real(x[i]), real(x[i]) }
+	MOVDDUP_XPTR_INCX__X5
+	MOVDDUP_XPTR_INCX_2__X7
+	MOVDDUP_XPTR_INCx3X__X9
+
+	MOVDDUP_8_XPTR__X2        // X_i = { imag(x[i]), imag(x[i]) }
+	MOVDDUP_8_XPTR_INCX__X4
+	MOVDDUP_8_XPTR_INCX_2__X6
+	MOVDDUP_8_XPTR_INCx3X__X8
+
+	// X_j = { imag(y[i]), real(y[i]) }
+	MOVUPS (Y_PTR), X10
+	MOVUPS (Y_PTR)(INC_Y*1), X11
+	MOVUPS (Y_PTR)(INC_Y*2), X12
+	MOVUPS (Y_PTR)(INCx3_Y*1), X13
+
+	// X_(i+1) = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i])  }
+	MULPD X10, X3
+	MULPD X11, X5
+	MULPD X12, X7
+	MULPD X13, X9
+
+	// X_j     = { real(y[i]), imag(y[i]) } (halves swapped)
+	SHUFPD $0x1, X10, X10
+	SHUFPD $0x1, X11, X11
+	SHUFPD $0x1, X12, X12
+	SHUFPD $0x1, X13, X13
+
+	// X_i     = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i])  }
+	MULPD X10, X2
+	MULPD X11, X4
+	MULPD X12, X6
+	MULPD X13, X8
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(y[i])*real(x[i]) + real(y[i])*imag(x[i]),
+	//	real(result[i]):  real(y[i])*real(x[i]) - imag(y[i])*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+	ADDSUBPD_X4_X5
+	ADDSUBPD_X6_X7
+	ADDSUBPD_X8_X9
+
+	// sum / psum += result[i] (results alternate between the two accumulators)
+	ADDPD X3, SUM
+	ADDPD X5, P_SUM
+	ADDPD X7, SUM
+	ADDPD X9, P_SUM
+
+	LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4])
+	LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[incY*4])
+
+	DECQ  LEN
+	JNZ   dot_loop   // } while --LEN > 0
+	ADDPD P_SUM, SUM // sum += psum
+	CMPQ  TAIL, $0   // if TAIL == 0 { return }
+	JE    dot_end
+
+dot_tail: // do { (one element per iteration)
+	MOVDDUP_XPTR__X3      // X_(i+1) = { real(x[i]), real(x[i]) }
+	MOVDDUP_8_XPTR__X2    // X_i = { imag(x[i]), imag(x[i]) }
+	MOVUPS (Y_PTR), X10   // X_j     = {  imag(y[i])             ,  real(y[i])              }
+	MULPD  X10, X3        // X_(i+1) = {  imag(y[i]) * real(x[i]),  real(y[i]) * real(x[i]) }
+	SHUFPD $0x1, X10, X10 // X_j     = {  real(y[i])             ,  imag(y[i])              }
+	MULPD  X10, X2        // X_i     = {  real(y[i]) * imag(x[i]),  imag(y[i]) * imag(x[i]) }
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(y[i])*real(x[i]) + real(y[i])*imag(x[i]),
+	//	real(result[i]):  real(y[i])*real(x[i]) - imag(y[i])*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+	ADDPD X3, SUM      // sum += result[i]
+	ADDQ  INC_X, X_PTR // X_PTR += incX * sizeof(complex128)
+	ADDQ  INC_Y, Y_PTR // Y_PTR += incY * sizeof(complex128)
+	DECQ  TAIL         // --TAIL
+	JNZ   dot_tail     // }  while TAIL > 0
+
+dot_end:
+	MOVUPS SUM, sum+88(FP) // store the result (zero when n == 0)
+	RET
--- a/vendor/gonum.org/v1/gonum/internal/asm/c128/dotuunitary_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/dotuunitary_amd64.s
@@ -0,0 +1,130 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define MOVDDUP_XPTR_IDX_8__X3    LONG $0x1C120FF2; BYTE $0xC6 // MOVDDUP (SI)(AX*8), X3
+#define MOVDDUP_16_XPTR_IDX_8__X5    LONG $0x6C120FF2; WORD $0x10C6 // MOVDDUP 16(SI)(AX*8), X5
+#define MOVDDUP_32_XPTR_IDX_8__X7    LONG $0x7C120FF2; WORD $0x20C6 // MOVDDUP 32(SI)(AX*8), X7
+#define MOVDDUP_48_XPTR_IDX_8__X9    LONG $0x120F44F2; WORD $0xC64C; BYTE $0x30 // MOVDDUP 48(SI)(AX*8), X9
+
+#define MOVDDUP_XPTR_IIDX_8__X2    LONG $0x14120FF2; BYTE $0xD6 // MOVDDUP (SI)(DX*8), X2
+#define MOVDDUP_16_XPTR_IIDX_8__X4    LONG $0x64120FF2; WORD $0x10D6 // MOVDDUP 16(SI)(DX*8), X4
+#define MOVDDUP_32_XPTR_IIDX_8__X6    LONG $0x74120FF2; WORD $0x20D6 // MOVDDUP 32(SI)(DX*8), X6
+#define MOVDDUP_48_XPTR_IIDX_8__X8    LONG $0x120F44F2; WORD $0xD644; BYTE $0x30 // MOVDDUP 48(SI)(DX*8), X8
+
+#define ADDSUBPD_X2_X3    LONG $0xDAD00F66 // ADDSUBPD X2, X3
+#define ADDSUBPD_X4_X5    LONG $0xECD00F66 // ADDSUBPD X4, X5
+#define ADDSUBPD_X6_X7    LONG $0xFED00F66 // ADDSUBPD X6, X7
+#define ADDSUBPD_X8_X9    LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9
+
+#define X_PTR SI
+#define Y_PTR DI
+#define LEN CX
+#define TAIL BX
+#define SUM X0
+#define P_SUM X1
+#define IDX AX
+#define I_IDX DX
+
+// func DotuUnitary(x, y []complex128) (sum complex128)
+TEXT ·DotuUnitary(SB), NOSPLIT, $0
+	MOVQ    x_base+0(FP), X_PTR  // X_PTR = &x
+	MOVQ    y_base+24(FP), Y_PTR // Y_PTR = &y
+	MOVQ    x_len+8(FP), LEN     // LEN = min( len(x), len(y) )
+	CMPQ    y_len+32(FP), LEN
+	CMOVQLE y_len+32(FP), LEN
+	PXOR    SUM, SUM             // SUM = 0
+	CMPQ    LEN, $0              // if LEN == 0 { return }
+	JE      dot_end
+	PXOR    P_SUM, P_SUM         // P_SUM = 0
+	XORQ    IDX, IDX             // IDX = 0 (float64 index of real(x[i]))
+	MOVQ    $1, DX               // I_IDX = 1 (float64 index of imag(x[i]))
+	MOVQ    LEN, TAIL
+	ANDQ    $3, TAIL             // TAIL = LEN % 4
+	SHRQ    $2, LEN              // LEN = floor( LEN / 4 )
+	JZ      dot_tail             // if LEN == 0 { goto dot_tail } (TAIL != 0 here)
+
+dot_loop: // do { (four elements per iteration)
+	MOVDDUP_XPTR_IDX_8__X3    // X_(i+1) = { real(x[i]), real(x[i]) }
+	MOVDDUP_16_XPTR_IDX_8__X5
+	MOVDDUP_32_XPTR_IDX_8__X7
+	MOVDDUP_48_XPTR_IDX_8__X9
+
+	MOVDDUP_XPTR_IIDX_8__X2    // X_i = { imag(x[i]), imag(x[i]) }
+	MOVDDUP_16_XPTR_IIDX_8__X4
+	MOVDDUP_32_XPTR_IIDX_8__X6
+	MOVDDUP_48_XPTR_IIDX_8__X8
+
+	// X_j = { imag(y[i]), real(y[i]) }
+	MOVUPS (Y_PTR)(IDX*8), X10
+	MOVUPS 16(Y_PTR)(IDX*8), X11
+	MOVUPS 32(Y_PTR)(IDX*8), X12
+	MOVUPS 48(Y_PTR)(IDX*8), X13
+
+	// X_(i+1) = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i])  }
+	MULPD X10, X3
+	MULPD X11, X5
+	MULPD X12, X7
+	MULPD X13, X9
+
+	// X_j = { real(y[i]), imag(y[i]) } (halves swapped)
+	SHUFPD $0x1, X10, X10
+	SHUFPD $0x1, X11, X11
+	SHUFPD $0x1, X12, X12
+	SHUFPD $0x1, X13, X13
+
+	// X_i     = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i])  }
+	MULPD X10, X2
+	MULPD X11, X4
+	MULPD X12, X6
+	MULPD X13, X8
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(y[i])*real(x[i]) + real(y[i])*imag(x[i]),
+	//	real(result[i]):  real(y[i])*real(x[i]) - imag(y[i])*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+	ADDSUBPD_X4_X5
+	ADDSUBPD_X6_X7
+	ADDSUBPD_X8_X9
+
+	// SUM / P_SUM += result[i] (results alternate between the two accumulators)
+	ADDPD X3, SUM
+	ADDPD X5, P_SUM
+	ADDPD X7, SUM
+	ADDPD X9, P_SUM
+
+	ADDQ  $8, IDX    // IDX += 8 (4 complex128 = 8 float64)
+	ADDQ  $8, I_IDX  // I_IDX += 8
+	DECQ  LEN
+	JNZ   dot_loop   // } while --LEN > 0
+	ADDPD P_SUM, SUM // SUM += P_SUM
+	CMPQ  TAIL, $0   // if TAIL == 0 { return }
+	JE    dot_end
+
+dot_tail: // do { (one element per iteration)
+	MOVDDUP_XPTR_IDX_8__X3     // X_(i+1) = { real(x[i])           , real(x[i])            }
+	MOVDDUP_XPTR_IIDX_8__X2    // X_i     = { imag(x[i])           , imag(x[i])            }
+	MOVUPS (Y_PTR)(IDX*8), X10 // X_j     = {  imag(y[i])             ,  real(y[i])              }
+	MULPD  X10, X3             // X_(i+1) = {  imag(y[i]) * real(x[i]),  real(y[i]) * real(x[i]) }
+	SHUFPD $0x1, X10, X10      // X_j     = {  real(y[i])             ,  imag(y[i])              }
+	MULPD  X10, X2             // X_i     = {  real(y[i]) * imag(x[i]),  imag(y[i]) * imag(x[i]) }
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(y[i])*real(x[i]) + real(y[i])*imag(x[i]),
+	//	real(result[i]):  real(y[i])*real(x[i]) - imag(y[i])*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+	ADDPD X3, SUM   // SUM += result[i]
+	ADDQ  $2, IDX   // IDX += 2
+	ADDQ  $2, I_IDX // I_IDX += 2
+	DECQ  TAIL      // --TAIL
+	JNZ   dot_tail  // }  while TAIL > 0
+
+dot_end:
+	MOVUPS SUM, sum+48(FP) // store the result (zero when LEN == 0)
+	RET
--- a/vendor/gonum.org/v1/gonum/internal/asm/c128/dscalinc_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/dscalinc_amd64.s
@@ -0,0 +1,69 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define SRC SI
+#define DST SI
+#define LEN CX
+#define TAIL BX
+#define INC R9
+#define INC3 R10
+#define ALPHA X0
+#define ALPHA_2 X1
+
+#define MOVDDUP_ALPHA    LONG $0x44120FF2; WORD $0x0824 // MOVDDUP 8(SP), X0
+
+// func DscalInc(alpha float64, x []complex128, n, inc uintptr)
+TEXT ·DscalInc(SB), NOSPLIT, $0
+	MOVQ x_base+8(FP), SRC // SRC = &x (DST is the same register: scaling is in place)
+	MOVQ n+32(FP), LEN     // LEN = n
+	CMPQ LEN, $0           // if LEN == 0 { return }
+	JE   dscal_end
+
+	MOVDDUP_ALPHA             // ALPHA = { alpha, alpha }
+	MOVQ   inc+40(FP), INC    // INC = inc
+	SHLQ   $4, INC            // INC = INC * sizeof(complex128)
+	LEAQ   (INC)(INC*2), INC3 // INC3 = 3 * INC
+	MOVUPS ALPHA, ALPHA_2     // Copy ALPHA to ALPHA_2 for pipelining
+	MOVQ   LEN, TAIL          // TAIL = LEN
+	SHRQ   $2, LEN            // LEN = floor( n / 4 )
+	JZ     dscal_tail         // if LEN == 0 { goto dscal_tail }
+
+dscal_loop: // do { (four elements per iteration)
+	MOVUPS (SRC), X2         // X_i = x[i]
+	MOVUPS (SRC)(INC*1), X3
+	MOVUPS (SRC)(INC*2), X4
+	MOVUPS (SRC)(INC3*1), X5
+
+	MULPD ALPHA, X2   // X_i *= ALPHA (real and imaginary parts both scaled)
+	MULPD ALPHA_2, X3
+	MULPD ALPHA, X4
+	MULPD ALPHA_2, X5
+
+	MOVUPS X2, (DST)         // x[i] = X_i
+	MOVUPS X3, (DST)(INC*1)
+	MOVUPS X4, (DST)(INC*2)
+	MOVUPS X5, (DST)(INC3*1)
+
+	LEAQ (SRC)(INC*4), SRC // SRC += INC*4
+	DECQ LEN
+	JNZ  dscal_loop        // } while --LEN > 0
+
+dscal_tail:
+	ANDQ $3, TAIL  // TAIL = TAIL % 4
+	JE   dscal_end // if TAIL == 0 { return }
+
+dscal_tail_loop: // do { (one element per iteration)
+	MOVUPS (SRC), X2       // X_i = x[i]
+	MULPD  ALPHA, X2       // X_i *= ALPHA
+	MOVUPS X2, (DST)       // x[i] = X_i
+	ADDQ   INC, SRC        // SRC += INC
+	DECQ   TAIL
+	JNZ    dscal_tail_loop // } while --TAIL > 0
+
+dscal_end:
+	RET
--- a/vendor/gonum.org/v1/gonum/internal/asm/c128/dscalunitary_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/dscalunitary_amd64.s
@@ -0,0 +1,66 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define SRC SI
+#define DST SI
+#define LEN CX
+#define IDX AX
+#define TAIL BX
+#define ALPHA X0
+#define ALPHA_2 X1
+
+#define MOVDDUP_ALPHA    LONG $0x44120FF2; WORD $0x0824 // MOVDDUP 8(SP), X0
+
+// func DscalUnitary(alpha float64, x []complex128)
+TEXT ·DscalUnitary(SB), NOSPLIT, $0
+	MOVQ x_base+8(FP), SRC // SRC = &x (DST is the same register: scaling is in place)
+	MOVQ x_len+16(FP), LEN // LEN = len(x)
+	CMPQ LEN, $0           // if LEN == 0 { return }
+	JE   dscal_end
+
+	MOVDDUP_ALPHA         // ALPHA = { alpha, alpha }
+	XORQ   IDX, IDX       // IDX = 0
+	MOVUPS ALPHA, ALPHA_2 // Copy ALPHA to ALPHA_2 for pipelining
+	MOVQ   LEN, TAIL      // TAIL = LEN
+	SHRQ   $2, LEN        // LEN = floor( len(x) / 4 )
+	JZ     dscal_tail     // if LEN == 0 { goto dscal_tail }
+
+dscal_loop: // do { (four elements per iteration)
+	MOVUPS (SRC)(IDX*8), X2   // X_i = x[i]
+	MOVUPS 16(SRC)(IDX*8), X3
+	MOVUPS 32(SRC)(IDX*8), X4
+	MOVUPS 48(SRC)(IDX*8), X5
+
+	MULPD ALPHA, X2   // X_i *= ALPHA (real and imaginary parts both scaled)
+	MULPD ALPHA_2, X3
+	MULPD ALPHA, X4
+	MULPD ALPHA_2, X5
+
+	MOVUPS X2, (DST)(IDX*8)   // x[i] = X_i
+	MOVUPS X3, 16(DST)(IDX*8)
+	MOVUPS X4, 32(DST)(IDX*8)
+	MOVUPS X5, 48(DST)(IDX*8)
+
+	ADDQ $8, IDX    // IDX += 8 (4 complex128 = 8 float64)
+	DECQ LEN
+	JNZ  dscal_loop // } while --LEN > 0
+
+dscal_tail:
+	ANDQ $3, TAIL  // TAIL = TAIL % 4
+	JZ   dscal_end // if TAIL == 0 { return }
+
+dscal_tail_loop: // do { (one element per iteration)
+	MOVUPS (SRC)(IDX*8), X2 // X_i = x[i]
+	MULPD  ALPHA, X2        // X_i *= ALPHA
+	MOVUPS X2, (DST)(IDX*8) // x[i] = X_i
+	ADDQ   $2, IDX          // IDX += 2
+	DECQ   TAIL
+	JNZ    dscal_tail_loop  // } while --TAIL > 0
+
+dscal_end:
+	RET
--- a/vendor/gonum.org/v1/gonum/internal/asm/c128/scal.go
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/scal.go
@@ -0,0 +1,33 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package c128
+
+// ScalUnitaryTo stores alpha*x[i] into dst[i] for every index of x:
+//
+//	for i, v := range x {
+//		dst[i] = alpha * v
+//	}
+func ScalUnitaryTo(dst []complex128, alpha complex128, x []complex128) {
+	for i := range x {
+		dst[i] = alpha * x[i]
+	}
+}
+
+// ScalIncTo writes alpha times n strided elements of x into strided slots of dst:
+//
+//	var idst, ix uintptr
+//	for i := 0; i < int(n); i++ {
+//		dst[idst] = alpha * x[ix]
+//		ix += incX
+//		idst += incDst
+//	}
+func ScalIncTo(dst []complex128, incDst uintptr, alpha complex128, x []complex128, n, incX uintptr) {
+	var out, src uintptr
+	for left := int(n); left > 0; left-- {
+		dst[out] = alpha * x[src]
+		out += incDst
+		src += incX
+	}
+}
--- a/vendor/gonum.org/v1/gonum/internal/asm/c128/scalUnitary_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/scalUnitary_amd64.s
@@ -0,0 +1,116 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define SRC SI
+#define DST SI
+#define LEN CX
+#define IDX AX
+#define TAIL BX
+#define ALPHA X0
+#define ALPHA_C X1
+#define ALPHA2 X10
+#define ALPHA_C2 X11
+
+#define MOVDDUP_X2_X3    LONG $0xDA120FF2 // MOVDDUP X2, X3
+#define MOVDDUP_X4_X5    LONG $0xEC120FF2 // MOVDDUP X4, X5
+#define MOVDDUP_X6_X7    LONG $0xFE120FF2 // MOVDDUP X6, X7
+#define MOVDDUP_X8_X9    LONG $0x120F45F2; BYTE $0xC8 // MOVDDUP X8, X9
+
+#define ADDSUBPD_X2_X3    LONG $0xDAD00F66 // ADDSUBPD X2, X3
+#define ADDSUBPD_X4_X5    LONG $0xECD00F66 // ADDSUBPD X4, X5
+#define ADDSUBPD_X6_X7    LONG $0xFED00F66 // ADDSUBPD X6, X7
+#define ADDSUBPD_X8_X9    LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9
+
+// func ScalUnitary(alpha complex128, x []complex128)
+TEXT ·ScalUnitary(SB), NOSPLIT, $0
+	MOVQ x_base+16(FP), SRC // SRC = &x (DST is the same register: scaling is in place)
+	MOVQ x_len+24(FP), LEN  // LEN = len(x)
+	CMPQ LEN, $0            // if LEN == 0 { return }
+	JE   scal_end
+
+	MOVUPS alpha+0(FP), ALPHA     // ALPHA = { imag(alpha), real(alpha) }
+	MOVAPS ALPHA, ALPHA_C
+	SHUFPD $0x1, ALPHA_C, ALPHA_C // ALPHA_C = { real(alpha), imag(alpha) }
+
+	XORQ   IDX, IDX          // IDX = 0
+	MOVAPS ALPHA, ALPHA2     // Copy ALPHA and ALPHA_C for pipelining
+	MOVAPS ALPHA_C, ALPHA_C2
+	MOVQ   LEN, TAIL
+	SHRQ   $2, LEN           // LEN = floor( len(x) / 4 )
+	JZ     scal_tail         // if LEN == 0 { goto scal_tail }
+
+scal_loop: // do { (four elements per iteration)
+	MOVUPS (SRC)(IDX*8), X2   // X_i = { imag(x[i]), real(x[i]) }
+	MOVUPS 16(SRC)(IDX*8), X4
+	MOVUPS 32(SRC)(IDX*8), X6
+	MOVUPS 48(SRC)(IDX*8), X8
+
+	// X_(i+1) = { real(x[i]), real(x[i]) }
+	MOVDDUP_X2_X3
+	MOVDDUP_X4_X5
+	MOVDDUP_X6_X7
+	MOVDDUP_X8_X9
+
+	// X_i = { imag(x[i]), imag(x[i]) }
+	SHUFPD $0x3, X2, X2
+	SHUFPD $0x3, X4, X4
+	SHUFPD $0x3, X6, X6
+	SHUFPD $0x3, X8, X8
+
+	// X_i     = { real(ALPHA) * imag(x[i]), imag(ALPHA) * imag(x[i])  }
+	// X_(i+1) = { imag(ALPHA) * real(x[i]), real(ALPHA) * real(x[i])  }
+	MULPD ALPHA_C, X2
+	MULPD ALPHA, X3
+	MULPD ALPHA_C2, X4
+	MULPD ALPHA2, X5
+	MULPD ALPHA_C, X6
+	MULPD ALPHA, X7
+	MULPD ALPHA_C2, X8
+	MULPD ALPHA2, X9
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(ALPHA)*real(x[i]) + real(ALPHA)*imag(x[i]),
+	//	real(result[i]):  real(ALPHA)*real(x[i]) - imag(ALPHA)*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+	ADDSUBPD_X4_X5
+	ADDSUBPD_X6_X7
+	ADDSUBPD_X8_X9
+
+	MOVUPS X3, (DST)(IDX*8)   // x[i] = X_(i+1)
+	MOVUPS X5, 16(DST)(IDX*8)
+	MOVUPS X7, 32(DST)(IDX*8)
+	MOVUPS X9, 48(DST)(IDX*8)
+	ADDQ   $8, IDX            // IDX += 8 (4 complex128 = 8 float64)
+	DECQ   LEN
+	JNZ    scal_loop          // } while --LEN > 0
+
+scal_tail:
+	ANDQ $3, TAIL // TAIL = TAIL % 4
+	JZ   scal_end // if TAIL == 0 { return }
+
+scal_tail_loop: // do { (one element per iteration)
+	MOVUPS (SRC)(IDX*8), X2 // X_i = { imag(x[i]), real(x[i]) }
+	MOVDDUP_X2_X3           // X_(i+1) = { real(x[i]), real(x[i]) }
+	SHUFPD $0x3, X2, X2     // X_i = { imag(x[i]), imag(x[i]) }
+	MULPD  ALPHA_C, X2      // X_i     = { real(ALPHA) * imag(x[i]), imag(ALPHA) * imag(x[i])  }
+	MULPD  ALPHA, X3        // X_(i+1) = { imag(ALPHA) * real(x[i]), real(ALPHA) * real(x[i])  }
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(ALPHA)*real(x[i]) + real(ALPHA)*imag(x[i]),
+	//	real(result[i]):  real(ALPHA)*real(x[i]) - imag(ALPHA)*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+
+	MOVUPS X3, (DST)(IDX*8) // x[i] = X_(i+1)
+	ADDQ   $2, IDX          // IDX += 2
+	DECQ   TAIL
+	JNZ    scal_tail_loop   // }  while --TAIL > 0
+
+scal_end:
+	RET
--- a/vendor/gonum.org/v1/gonum/internal/asm/c128/scalinc_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/scalinc_amd64.s
@@ -0,0 +1,121 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define SRC SI
+#define DST SI
+#define LEN CX
+#define TAIL BX
+#define INC R9
+#define INC3 R10
+#define ALPHA X0
+#define ALPHA_C X1
+#define ALPHA2 X10
+#define ALPHA_C2 X11
+
+#define MOVDDUP_X2_X3    LONG $0xDA120FF2 // MOVDDUP X2, X3
+#define MOVDDUP_X4_X5    LONG $0xEC120FF2 // MOVDDUP X4, X5
+#define MOVDDUP_X6_X7    LONG $0xFE120FF2 // MOVDDUP X6, X7
+#define MOVDDUP_X8_X9    LONG $0x120F45F2; BYTE $0xC8 // MOVDDUP X8, X9
+
+#define ADDSUBPD_X2_X3    LONG $0xDAD00F66 // ADDSUBPD X2, X3
+#define ADDSUBPD_X4_X5    LONG $0xECD00F66 // ADDSUBPD X4, X5
+#define ADDSUBPD_X6_X7    LONG $0xFED00F66 // ADDSUBPD X6, X7
+#define ADDSUBPD_X8_X9    LONG $0xD00F4566; BYTE $0xC8 // ADDSUBPD X8, X9
+
+// func ScalInc(alpha complex128, x []complex128, n, inc uintptr)
+TEXT ·ScalInc(SB), NOSPLIT, $0
+	MOVQ x_base+16(FP), SRC // SRC = &x (DST is the same register: scaling is in place)
+	MOVQ n+40(FP), LEN      // LEN = n
+	CMPQ LEN, $0
+	JE   scal_end           // if LEN == 0 { return }
+
+	MOVQ inc+48(FP), INC    // INC = inc
+	SHLQ $4, INC            // INC = INC * sizeof(complex128)
+	LEAQ (INC)(INC*2), INC3 // INC3 = 3 * INC
+
+	MOVUPS alpha+0(FP), ALPHA     // ALPHA = { imag(alpha), real(alpha) }
+	MOVAPS ALPHA, ALPHA_C
+	SHUFPD $0x1, ALPHA_C, ALPHA_C // ALPHA_C = { real(alpha), imag(alpha) }
+
+	MOVAPS ALPHA, ALPHA2     // Copy ALPHA and ALPHA_C for pipelining
+	MOVAPS ALPHA_C, ALPHA_C2
+	MOVQ   LEN, TAIL
+	SHRQ   $2, LEN           // LEN = floor( n / 4 )
+	JZ     scal_tail         // if LEN == 0 { goto scal_tail }
+
+scal_loop: // do { (four elements per iteration)
+	MOVUPS (SRC), X2         // X_i = { imag(x[i]), real(x[i]) }
+	MOVUPS (SRC)(INC*1), X4
+	MOVUPS (SRC)(INC*2), X6
+	MOVUPS (SRC)(INC3*1), X8
+
+	// X_(i+1) = { real(x[i]), real(x[i]) }
+	MOVDDUP_X2_X3
+	MOVDDUP_X4_X5
+	MOVDDUP_X6_X7
+	MOVDDUP_X8_X9
+
+	// X_i = { imag(x[i]), imag(x[i]) }
+	SHUFPD $0x3, X2, X2
+	SHUFPD $0x3, X4, X4
+	SHUFPD $0x3, X6, X6
+	SHUFPD $0x3, X8, X8
+
+	// X_i     = { real(ALPHA) * imag(x[i]), imag(ALPHA) * imag(x[i])  }
+	// X_(i+1) = { imag(ALPHA) * real(x[i]), real(ALPHA) * real(x[i])  }
+	MULPD ALPHA_C, X2
+	MULPD ALPHA, X3
+	MULPD ALPHA_C2, X4
+	MULPD ALPHA2, X5
+	MULPD ALPHA_C, X6
+	MULPD ALPHA, X7
+	MULPD ALPHA_C2, X8
+	MULPD ALPHA2, X9
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(ALPHA)*real(x[i]) + real(ALPHA)*imag(x[i]),
+	//	real(result[i]):  real(ALPHA)*real(x[i]) - imag(ALPHA)*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+	ADDSUBPD_X4_X5
+	ADDSUBPD_X6_X7
+	ADDSUBPD_X8_X9
+
+	MOVUPS X3, (DST)         // x[i] = X_(i+1)
+	MOVUPS X5, (DST)(INC*1)
+	MOVUPS X7, (DST)(INC*2)
+	MOVUPS X9, (DST)(INC3*1)
+
+	LEAQ (SRC)(INC*4), SRC // SRC = &(SRC[inc*4])
+	DECQ LEN
+	JNZ  scal_loop         // } while --LEN > 0
+
+scal_tail:
+	ANDQ $3, TAIL // TAIL = TAIL % 4
+	JE   scal_end // if TAIL == 0 { return }
+
+scal_tail_loop: // do { (one element per iteration)
+	MOVUPS (SRC), X2    // X_i = { imag(x[i]), real(x[i]) }
+	MOVDDUP_X2_X3       // X_(i+1) = { real(x[i]), real(x[i]) }
+	SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) }
+	MULPD  ALPHA_C, X2  // X_i     = { real(ALPHA) * imag(x[i]), imag(ALPHA) * imag(x[i])  }
+	MULPD  ALPHA, X3    // X_(i+1) = { imag(ALPHA) * real(x[i]), real(ALPHA) * real(x[i])  }
+
+	// X_(i+1) = {
+	//	imag(result[i]):  imag(ALPHA)*real(x[i]) + real(ALPHA)*imag(x[i]),
+	//	real(result[i]):  real(ALPHA)*real(x[i]) - imag(ALPHA)*imag(x[i])
+	//  }
+	ADDSUBPD_X2_X3
+
+	MOVUPS X3, (DST)      // x[i] = X_(i+1)
+	ADDQ   INC, SRC       // SRC = &(SRC[inc])
+	DECQ   TAIL
+	JNZ    scal_tail_loop // } while --TAIL > 0
+
+scal_end:
+	RET
--- a/vendor/gonum.org/v1/gonum/internal/asm/c128/stubs.go
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/stubs.go
@@ -0,0 +1,180 @@
+// Copyright ©2020 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package c128
+
+import (
+	"math"
+	"math/cmplx"
+)
+
+// Add adds each element of s to the element of dst at the same index:
+//
+//	for i, v := range s {
+//		dst[i] += v
+//	}
+func Add(dst, s []complex128) {
+	for i := range s {
+		dst[i] += s[i]
+	}
+}
+
+// AddConst adds alpha to every element of x in place:
+//
+//	for i := range x {
+//		x[i] += alpha
+//	}
+func AddConst(alpha complex128, x []complex128) {
+	for i := 0; i < len(x); i++ {
+		x[i] = x[i] + alpha
+	}
+}
+
+// CumSum writes the running sum of s into dst and returns dst:
+//
+//	if len(s) == 0 {
+//		return dst
+//	}
+//	dst[0] = s[0]
+//	for i, v := range s[1:] {
+//		dst[i+1] = dst[i] + v
+//	}
+//	return dst
+func CumSum(dst, s []complex128) []complex128 {
+	if len(s) == 0 {
+		return dst
+	}
+	dst[0] = s[0]
+	for k := 1; k < len(s); k++ {
+		dst[k] = dst[k-1] + s[k]
+	}
+	return dst
+}
+
+// CumProd writes the running product of s into dst and returns dst:
+//
+//	if len(s) == 0 {
+//		return dst
+//	}
+//	dst[0] = s[0]
+//	for i, v := range s[1:] {
+//		dst[i+1] = dst[i] * v
+//	}
+//	return dst
+func CumProd(dst, s []complex128) []complex128 {
+	if len(s) == 0 {
+		return dst
+	}
+	dst[0] = s[0]
+	for k := 1; k < len(s); k++ {
+		dst[k] = dst[k-1] * s[k]
+	}
+	return dst
+}
+
+// Div divides each element of dst by the element of s at the same index:
+//
+//	for i, v := range s {
+//		dst[i] /= v
+//	}
+func Div(dst, s []complex128) {
+	for i := range s {
+		dst[i] /= s[i]
+	}
+}
+
+// DivTo stores s[i]/t[i] into dst[i] for every index of s and returns dst:
+//
+//	for i, v := range s {
+//		dst[i] = v / t[i]
+//	}
+//	return dst
+func DivTo(dst, s, t []complex128) []complex128 {
+	for i := range s {
+		dst[i] = s[i] / t[i]
+	}
+	return dst
+}
+
+// DotUnitary returns the sum over i of conj(x[i]) * y[i]:
+//
+//	for i, v := range x {
+//		sum += cmplx.Conj(v) * y[i]
+//	}
+//	return sum
+func DotUnitary(x, y []complex128) (sum complex128) {
+	for i := range x {
+		sum += cmplx.Conj(x[i]) * y[i]
+	}
+	return sum
+}
+
+// L2DistanceUnitary returns the L2-norm of x-y, accumulated as scale*sqrt(sumSquares).
+func L2DistanceUnitary(x, y []complex128) (norm float64) {
+	var scale float64 // largest |x[i]-y[i]| seen so far
+	sumSquares := 1.0 // sum of (|x[i]-y[i]|/scale)^2 terms, relative to the current scale
+	for i, v := range x {
+		v -= y[i]
+		if v == 0 {
+			continue // a zero difference contributes nothing
+		}
+		absxi := cmplx.Abs(v)
+		if math.IsNaN(absxi) {
+			return math.NaN()
+		}
+		if scale < absxi { // new largest magnitude: rescale the running sum to it
+			s := scale / absxi
+			sumSquares = 1 + sumSquares*s*s
+			scale = absxi
+		} else {
+			s := absxi / scale
+			sumSquares += s * s
+		}
+	}
+	if math.IsInf(scale, 1) {
+		return math.Inf(1)
+	}
+	return scale * math.Sqrt(sumSquares) // scale is still 0 if every difference was zero
+}
+
+// L2NormUnitary returns the L2-norm of x, accumulated as scale*sqrt(sumSquares).
+func L2NormUnitary(x []complex128) (norm float64) {
+	var scale float64 // largest |x[i]| seen so far
+	sumSquares := 1.0 // sum of (|x[i]|/scale)^2 terms, relative to the current scale
+	for _, v := range x {
+		if v == 0 {
+			continue // a zero element contributes nothing
+		}
+		absxi := cmplx.Abs(v)
+		if math.IsNaN(absxi) {
+			return math.NaN()
+		}
+		if scale < absxi { // new largest magnitude: rescale the running sum to it
+			s := scale / absxi
+			sumSquares = 1 + sumSquares*s*s
+			scale = absxi
+		} else {
+			s := absxi / scale
+			sumSquares += s * s
+		}
+	}
+	if math.IsInf(scale, 1) {
+		return math.Inf(1)
+	}
+	return scale * math.Sqrt(sumSquares) // scale is still 0 if every element was zero
+}
+
+// Sum returns the sum of the elements of x, i.e. the final value of sum in:
+//
+//	var sum complex128
+//	for i := range x {
+//		sum += x[i]
+//	}
+func Sum(x []complex128) complex128 {
+	var total complex128
+	for i := range x {
+		total += x[i]
+	}
+	return total
+}
--- a/vendor/gonum.org/v1/gonum/internal/asm/c128/stubs_amd64.go
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/stubs_amd64.go
@@ -0,0 +1,109 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !noasm && !gccgo && !safe
+// +build !noasm,!gccgo,!safe
+
+package c128
+
+// AxpyUnitary is declared without a Go body (implemented outside Go) and is equivalent to
+//
+//	for i, v := range x {
+//		y[i] += alpha * v
+//	}
+func AxpyUnitary(alpha complex128, x, y []complex128)
+
+// AxpyUnitaryTo is declared without a Go body (implemented outside Go) and is equivalent to
+//
+//	for i, v := range x {
+//		dst[i] = alpha*v + y[i]
+//	}
+func AxpyUnitaryTo(dst []complex128, alpha complex128, x, y []complex128)
+
+// AxpyInc is declared without a Go body (implemented outside Go) and is equivalent to
+//
+//	for i := 0; i < int(n); i++ {
+//		y[iy] += alpha * x[ix]
+//		ix += incX
+//		iy += incY
+//	}
+func AxpyInc(alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr)
+
+// AxpyIncTo is declared without a Go body (implemented outside Go) and is equivalent to
+//
+//	for i := 0; i < int(n); i++ {
+//		dst[idst] = alpha*x[ix] + y[iy]
+//		ix += incX
+//		iy += incY
+//		idst += incDst
+//	}
+func AxpyIncTo(dst []complex128, incDst, idst uintptr, alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr)
+
+// DscalUnitary is declared without a Go body (implemented outside Go) and is equivalent to
+//
+//	for i, v := range x {
+//		x[i] = complex(real(v)*alpha, imag(v)*alpha)
+//	}
+func DscalUnitary(alpha float64, x []complex128)
+
+// DscalInc is declared without a Go body (implemented outside Go) and is equivalent to
+//
+//	var ix uintptr
+//	for i := 0; i < int(n); i++ {
+//		x[ix] = complex(real(x[ix])*alpha, imag(x[ix])*alpha)
+//		ix += inc
+//	}
+func DscalInc(alpha float64, x []complex128, n, inc uintptr)
+
+// ScalInc is declared without a Go body (implemented outside Go) and is equivalent to
+//
+//	var ix uintptr
+//	for i := 0; i < int(n); i++ {
+//		x[ix] *= alpha
+//		ix += inc
+//	}
+func ScalInc(alpha complex128, x []complex128, n, inc uintptr)
+
+// ScalUnitary is declared without a Go body (implemented outside Go) and is equivalent to
+//
+//	for i := range x {
+//		x[i] *= alpha
+//	}
+func ScalUnitary(alpha complex128, x []complex128)
+
+// DotcUnitary is declared without a Go body (implemented outside Go) and is equivalent to
+//
+//	for i, v := range x {
+//		sum += y[i] * cmplx.Conj(v)
+//	}
+//	return sum
+func DotcUnitary(x, y []complex128) (sum complex128)
+
+// DotcInc is declared without a Go body (implemented outside Go) and is equivalent to
+//
+//	for i := 0; i < int(n); i++ {
+//		sum += y[iy] * cmplx.Conj(x[ix])
+//		ix += incX
+//		iy += incY
+//	}
+//	return sum
+func DotcInc(x, y []complex128, n, incX, incY, ix, iy uintptr) (sum complex128)
+
+// DotuUnitary is declared without a Go body (implemented outside Go) and is equivalent to
+//
+//	for i, v := range x {
+//		sum += y[i] * v
+//	}
+//	return sum
+func DotuUnitary(x, y []complex128) (sum complex128)
+
+// DotuInc is declared without a Go body (implemented outside Go) and is equivalent to
+//
+//	for i := 0; i < int(n); i++ {
+//		sum += y[iy] * x[ix]
+//		ix += incX
+//		iy += incY
+//	}
+//	return sum
+func DotuInc(x, y []complex128, n, incX, incY, ix, iy uintptr) (sum complex128)
--- a/vendor/gonum.org/v1/gonum/internal/asm/c128/stubs_noasm.go
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c128/stubs_noasm.go
@@ -0,0 +1,176 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !amd64 || noasm || gccgo || safe
+// +build !amd64 noasm gccgo safe
+
+package c128
+
+import "math/cmplx"
+
+// AxpyUnitary accumulates alpha*x into y element by element:
+//
+//	for i, v := range x {
+//		y[i] += alpha * v
+//	}
+func AxpyUnitary(alpha complex128, x, y []complex128) {
+	for i := range x {
+		y[i] += alpha * x[i]
+	}
+}
+
+// AxpyUnitaryTo stores alpha*x[i] + y[i] into dst[i] for every index of x:
+//
+//	for i, v := range x {
+//		dst[i] = alpha*v + y[i]
+//	}
+func AxpyUnitaryTo(dst []complex128, alpha complex128, x, y []complex128) {
+	for i := range x {
+		dst[i] = alpha*x[i] + y[i]
+	}
+}
+
+// AxpyInc accumulates alpha times n strided elements of x into strided elements of y:
+//
+//	for i := 0; i < int(n); i++ {
+//		y[iy] += alpha * x[ix]
+//		ix += incX
+//		iy += incY
+//	}
+func AxpyInc(alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr) {
+	for left := int(n); left > 0; left-- {
+		y[iy] += alpha * x[ix]
+		iy += incY
+		ix += incX
+	}
+}
+
+// AxpyIncTo stores alpha*x[ix] + y[iy] into dst[idst] for n strided positions:
+//
+//	for i := 0; i < int(n); i++ {
+//		dst[idst] = alpha*x[ix] + y[iy]
+//		ix += incX
+//		iy += incY
+//		idst += incDst
+//	}
+func AxpyIncTo(dst []complex128, incDst, idst uintptr, alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr) {
+	for left := int(n); left > 0; left-- {
+		dst[idst] = alpha*x[ix] + y[iy]
+		idst += incDst
+		iy += incY
+		ix += incX
+	}
+}
+
+// DscalUnitary scales the real and imaginary parts of every element of x by alpha:
+//
+//	for i, v := range x {
+//		x[i] = complex(real(v)*alpha, imag(v)*alpha)
+//	}
+func DscalUnitary(alpha float64, x []complex128) {
+	for i := range x {
+		x[i] = complex(real(x[i])*alpha, imag(x[i])*alpha)
+	}
+}
+
+// DscalInc scales n elements of x, spaced inc apart starting at index 0, by alpha:
+//
+//	var ix uintptr
+//	for i := 0; i < int(n); i++ {
+//		x[ix] = complex(real(x[ix])*alpha, imag(x[ix])*alpha)
+//		ix += inc
+//	}
+func DscalInc(alpha float64, x []complex128, n, inc uintptr) {
+	var pos uintptr
+	for left := int(n); left > 0; left-- {
+		x[pos] = complex(real(x[pos])*alpha, imag(x[pos])*alpha)
+		pos += inc
+	}
+}
+
+// ScalInc multiplies n elements of x, spaced inc apart starting at index 0, by alpha:
+//
+//	var ix uintptr
+//	for i := 0; i < int(n); i++ {
+//		x[ix] *= alpha
+//		ix += inc
+//	}
+func ScalInc(alpha complex128, x []complex128, n, inc uintptr) {
+	var pos uintptr
+	for left := int(n); left > 0; left-- {
+		x[pos] *= alpha
+		pos += inc
+	}
+}
+
+// ScalUnitary multiplies every element of x by alpha in place:
+//
+//	for i := range x {
+//		x[i] *= alpha
+//	}
+func ScalUnitary(alpha complex128, x []complex128) {
+	for i := 0; i < len(x); i++ {
+		x[i] = x[i] * alpha
+	}
+}
+
+// DotcUnitary returns the sum over i of y[i] * conj(x[i]):
+//
+//	for i, v := range x {
+//		sum += y[i] * cmplx.Conj(v)
+//	}
+//	return sum
+func DotcUnitary(x, y []complex128) (sum complex128) {
+	for i := range x {
+		sum += y[i] * cmplx.Conj(x[i])
+	}
+	return sum
+}
+
+// DotcInc returns the sum of y[iy] * conj(x[ix]) over n strided positions:
+//
+//	for i := 0; i < int(n); i++ {
+//		sum += y[iy] * cmplx.Conj(x[ix])
+//		ix += incX
+//		iy += incY
+//	}
+//	return sum
+func DotcInc(x, y []complex128, n, incX, incY, ix, iy uintptr) (sum complex128) {
+	for left := int(n); left > 0; left-- {
+		sum += y[iy] * cmplx.Conj(x[ix])
+		iy += incY
+		ix += incX
+	}
+	return sum
+}
+
+// DotuUnitary returns the unconjugated sum over i of y[i] * x[i]:
+//
+//	for i, v := range x {
+//		sum += y[i] * v
+//	}
+//	return sum
+func DotuUnitary(x, y []complex128) (sum complex128) {
+	for i := range x {
+		sum += y[i] * x[i]
+	}
+	return sum
+}
+
+// DotuInc returns the unconjugated sum of y[iy] * x[ix] over n strided positions:
+//
+//	for i := 0; i < int(n); i++ {
+//		sum += y[iy] * x[ix]
+//		ix += incX
+//		iy += incY
+//	}
+//	return sum
+func DotuInc(x, y []complex128, n, incX, incY, ix, iy uintptr) (sum complex128) {
+	for left := int(n); left > 0; left-- {
+		sum += y[iy] * x[ix]
+		iy += incY
+		ix += incX
+	}
+	return sum
+}
--- a/vendor/gonum.org/v1/gonum/internal/asm/c64/axpyinc_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/axpyinc_amd64.s
@@ -0,0 +1,151 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+// MOVSHDUP X3, X2
+#define MOVSHDUP_X3_X2 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xD3
+// MOVSLDUP X3, X3
+#define MOVSLDUP_X3_X3 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xDB
+// ADDSUBPS X2, X3
+#define ADDSUBPS_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA
+
+// MOVSHDUP X5, X4
+#define MOVSHDUP_X5_X4 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xE5
+// MOVSLDUP X5, X5
+#define MOVSLDUP_X5_X5 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xED
+// ADDSUBPS X4, X5
+#define ADDSUBPS_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC
+
+// MOVSHDUP X7, X6
+#define MOVSHDUP_X7_X6 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xF7
+// MOVSLDUP X7, X7
+#define MOVSLDUP_X7_X7 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xFF
+// ADDSUBPS X6, X7
+#define ADDSUBPS_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE
+
+// MOVSHDUP X9, X8
+#define MOVSHDUP_X9_X8 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x16; BYTE $0xC1
+// MOVSLDUP X9, X9
+#define MOVSLDUP_X9_X9 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC9
+// ADDSUBPS X8, X9
+#define ADDSUBPS_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8
+
+// func AxpyInc(alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr)
+//
+// AxpyInc adds alpha*x[ix] to y[iy] in place for n elements, stepping the x
+// position by incX and the y position by incY after every element. Four
+// elements are handled per axpyi_loop iteration; the remaining n % 4 elements
+// are handled one at a time in axpyi_tail.
+//
+// Register roles: SI = x read pointer, DX = y read pointer, DI = y write
+// pointer, R8 = incX in bytes, R9 = incY in bytes, BX = n / 4, CX = n % 4,
+// X0/X10 = alpha, X1/X11 = alpha with real and imaginary parts swapped.
+TEXT ·AxpyInc(SB), NOSPLIT, $0
+	MOVQ   x_base+8(FP), SI  // SI = &x
+	MOVQ   y_base+32(FP), DI // DI = &y
+	MOVQ   n+56(FP), CX      // CX = n
+	CMPQ   CX, $0            // if n==0 { return }
+	JE     axpyi_end
+	MOVQ   ix+80(FP), R8     // R8 = ix
+	MOVQ   iy+88(FP), R9     // R9 = iy
+	LEAQ   (SI)(R8*8), SI    // SI = &(x[ix])
+	LEAQ   (DI)(R9*8), DI    // DI = &(y[iy])
+	MOVQ   DI, DX            // DX = DI    // Separate read (DX) and write (DI) pointers into y
+	MOVQ   incX+64(FP), R8   // R8 = incX
+	SHLQ   $3, R8            // R8 *= sizeof(complex64)
+	MOVQ   incY+72(FP), R9   // R9 = incY
+	SHLQ   $3, R9            // R9 *= sizeof(complex64)
+	MOVSD  alpha+0(FP), X0   // X0 = { 0, 0, imag(a), real(a) }
+	MOVAPS X0, X1
+	SHUFPS $0x11, X1, X1     // X1 = { real(a), imag(a), real(a), imag(a) }; low pair is { real(a), imag(a) }
+	MOVAPS X0, X10           // Copy X0 and X1 for pipelining
+	MOVAPS X1, X11
+	MOVQ   CX, BX
+	ANDQ   $3, CX            // CX = n % 4
+	SHRQ   $2, BX            // BX = floor( n / 4 )
+	JZ     axpyi_tail        // if BX == 0 { goto axpyi_tail }
+
+axpyi_loop: // do {
+	MOVSD (SI), X3       // X_i = { imag(x[i]), real(x[i]) }
+	MOVSD (SI)(R8*1), X5
+	LEAQ  (SI)(R8*2), SI // SI = &(SI[incX*2])
+	MOVSD (SI), X7
+	MOVSD (SI)(R8*1), X9
+
+	// X_(i-1) = { imag(x[i]), imag(x[i]) }
+	MOVSHDUP_X3_X2
+	MOVSHDUP_X5_X4
+	MOVSHDUP_X7_X6
+	MOVSHDUP_X9_X8
+
+	// X_i = { real(x[i]), real(x[i]) }
+	MOVSLDUP_X3_X3
+	MOVSLDUP_X5_X5
+	MOVSLDUP_X7_X7
+	MOVSLDUP_X9_X9
+
+	// X_(i-1) = {  real(a) * imag(x[i]),   imag(a) * imag(x[i]) }
+	// X_i     = {  imag(a) * real(x[i]),   real(a) * real(x[i])  }
+	MULPS X1, X2
+	MULPS X0, X3
+	MULPS X11, X4
+	MULPS X10, X5
+	MULPS X1, X6
+	MULPS X0, X7
+	MULPS X11, X8
+	MULPS X10, X9
+
+	// X_i = {
+	//	imag(result[i]):   imag(a)*real(x[i]) + real(a)*imag(x[i]),
+	//	real(result[i]):   real(a)*real(x[i]) - imag(a)*imag(x[i]),
+	//  }
+	ADDSUBPS_X2_X3
+	ADDSUBPS_X4_X5
+	ADDSUBPS_X6_X7
+	ADDSUBPS_X8_X9
+
+	// X_i = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
+	MOVSD (DX), X2
+	MOVSD (DX)(R9*1), X4
+	LEAQ  (DX)(R9*2), DX // DX = &(DX[incY*2])
+	MOVSD (DX), X6
+	MOVSD (DX)(R9*1), X8
+	ADDPS X2, X3
+	ADDPS X4, X5
+	ADDPS X6, X7
+	ADDPS X8, X9
+
+	MOVSD X3, (DI)       // y[i] = X_i
+	MOVSD X5, (DI)(R9*1)
+	LEAQ  (DI)(R9*2), DI // DI = &(DI[incY*2])
+	MOVSD X7, (DI)
+	MOVSD X9, (DI)(R9*1)
+	LEAQ  (SI)(R8*2), SI // SI = &(SI[incX*2])
+	LEAQ  (DX)(R9*2), DX // DX = &(DX[incY*2])
+	LEAQ  (DI)(R9*2), DI // DI = &(DI[incY*2])
+	DECQ  BX
+	JNZ   axpyi_loop     // }  while --BX > 0
+	CMPQ  CX, $0         // if CX == 0 { return }
+	JE    axpyi_end
+
+	// The tail reads and writes y through DI only; DX is not used past this point.
+axpyi_tail: // do {
+	MOVSD (SI), X3 // X_i = { imag(x[i]), real(x[i]) }
+	MOVSHDUP_X3_X2 // X_(i-1) = { imag(x[i]), imag(x[i]) }
+	MOVSLDUP_X3_X3 // X_i = { real(x[i]), real(x[i]) }
+
+	// X_i     = { imag(a) * real(x[i]),  real(a) * real(x[i]) }
+	// X_(i-1) = { real(a) * imag(x[i]),  imag(a) * imag(x[i]) }
+	MULPS X1, X2
+	MULPS X0, X3
+
+	// X_i = {
+	//	imag(result[i]):   imag(a)*real(x[i]) + real(a)*imag(x[i]),
+	//	real(result[i]):   real(a)*real(x[i]) - imag(a)*imag(x[i]),
+	//  }
+	ADDSUBPS_X2_X3 // (ai*x1r+ar*x1i, ar*x1r-ai*x1i)
+
+	// X_i = { imag(result[i]) + imag(y[i]),  real(result[i]) + real(y[i])  }
+	MOVSD (DI), X4
+	ADDPS X4, X3
+	MOVSD X3, (DI)   // y[i] = X_i
+	ADDQ  R8, SI     // SI += incX
+	ADDQ  R9, DI     // DI += incY
+	LOOP  axpyi_tail // } while --CX > 0
+
+axpyi_end:
+	RET
--- a/vendor/gonum.org/v1/gonum/internal/asm/c64/axpyincto_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/axpyincto_amd64.s
@@ -0,0 +1,156 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+// MOVSHDUP X3, X2
+#define MOVSHDUP_X3_X2 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xD3
+// MOVSLDUP X3, X3
+#define MOVSLDUP_X3_X3 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xDB
+// ADDSUBPS X2, X3
+#define ADDSUBPS_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA
+
+// MOVSHDUP X5, X4
+#define MOVSHDUP_X5_X4 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xE5
+// MOVSLDUP X5, X5
+#define MOVSLDUP_X5_X5 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xED
+// ADDSUBPS X4, X5
+#define ADDSUBPS_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC
+
+// MOVSHDUP X7, X6
+#define MOVSHDUP_X7_X6 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xF7
+// MOVSLDUP X7, X7
+#define MOVSLDUP_X7_X7 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xFF
+// ADDSUBPS X6, X7
+#define ADDSUBPS_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE
+
+// MOVSHDUP X9, X8
+#define MOVSHDUP_X9_X8 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x16; BYTE $0xC1
+// MOVSLDUP X9, X9
+#define MOVSLDUP_X9_X9 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC9
+// ADDSUBPS X8, X9
+#define ADDSUBPS_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8
+
+// func AxpyIncTo(dst []complex64, incDst, idst uintptr, alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr)
+//
+// AxpyIncTo stores alpha*x[ix] + y[iy] into dst[idst] for n elements, stepping
+// the x, y and dst positions by incX, incY and incDst respectively after every
+// element. Four elements are handled per axpyi_loop iteration; the remaining
+// n % 4 elements are handled one at a time in axpyi_tail.
+//
+// Register roles: SI = x pointer, DX = y pointer, DI = dst pointer,
+// R8/R9/R10 = incX/incY/incDst in bytes, BX = n / 4, CX = n % 4,
+// X0/X10 = alpha, X1/X11 = alpha with real and imaginary parts swapped.
+TEXT ·AxpyIncTo(SB), NOSPLIT, $0
+	MOVQ   dst_base+0(FP), DI // DI = &dst
+	MOVQ   x_base+48(FP), SI  // SI = &x
+	MOVQ   y_base+72(FP), DX  // DX = &y
+	MOVQ   n+96(FP), CX       // CX = n
+	CMPQ   CX, $0             // if n==0 { return }
+	JE     axpyi_end
+	MOVQ   ix+120(FP), R8     // Load the first index
+	MOVQ   iy+128(FP), R9
+	MOVQ   idst+32(FP), R10
+	LEAQ   (SI)(R8*8), SI     // SI = &(x[ix])
+	LEAQ   (DX)(R9*8), DX     // DX = &(y[iy])
+	LEAQ   (DI)(R10*8), DI    // DI = &(dst[idst])
+	MOVQ   incX+104(FP), R8   // Incrementors*8 for easy iteration (ADDQ)
+	SHLQ   $3, R8
+	MOVQ   incY+112(FP), R9
+	SHLQ   $3, R9
+	MOVQ   incDst+24(FP), R10
+	SHLQ   $3, R10
+	MOVSD  alpha+40(FP), X0   // X0 = { 0, 0, imag(a), real(a) }
+	MOVAPS X0, X1
+	SHUFPS $0x11, X1, X1      // X1 = { real(a), imag(a), real(a), imag(a) }; low pair is { real(a), imag(a) }
+	MOVAPS X0, X10            // Copy X0 and X1 for pipelining
+	MOVAPS X1, X11
+	MOVQ   CX, BX
+	ANDQ   $3, CX             // CX = n % 4
+	SHRQ   $2, BX             // BX = floor( n / 4 )
+	JZ     axpyi_tail         // if BX == 0 { goto axpyi_tail }
+
+axpyi_loop: // do {
+	MOVSD (SI), X3       // X_i = { imag(x[i]), real(x[i]) }
+	MOVSD (SI)(R8*1), X5
+	LEAQ  (SI)(R8*2), SI // SI = &(SI[incX*2])
+	MOVSD (SI), X7
+	MOVSD (SI)(R8*1), X9
+
+	// X_(i-1) = { imag(x[i]), imag(x[i]) }
+	MOVSHDUP_X3_X2
+	MOVSHDUP_X5_X4
+	MOVSHDUP_X7_X6
+	MOVSHDUP_X9_X8
+
+	// X_i = { real(x[i]), real(x[i]) }
+	MOVSLDUP_X3_X3
+	MOVSLDUP_X5_X5
+	MOVSLDUP_X7_X7
+	MOVSLDUP_X9_X9
+
+	// X_(i-1) = {  real(a) * imag(x[i]),   imag(a) * imag(x[i]) }
+	// X_i     = {  imag(a) * real(x[i]),   real(a) * real(x[i])  }
+	MULPS X1, X2
+	MULPS X0, X3
+	MULPS X11, X4
+	MULPS X10, X5
+	MULPS X1, X6
+	MULPS X0, X7
+	MULPS X11, X8
+	MULPS X10, X9
+
+	// X_i = {
+	//	imag(result[i]):   imag(a)*real(x[i]) + real(a)*imag(x[i]),
+	//	real(result[i]):   real(a)*real(x[i]) - imag(a)*imag(x[i]),
+	//  }
+	ADDSUBPS_X2_X3
+	ADDSUBPS_X4_X5
+	ADDSUBPS_X6_X7
+	ADDSUBPS_X8_X9
+
+	// X_i = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) }
+	MOVSD (DX), X2
+	MOVSD (DX)(R9*1), X4
+	LEAQ  (DX)(R9*2), DX // DX = &(DX[incY*2])
+	MOVSD (DX), X6
+	MOVSD (DX)(R9*1), X8
+	ADDPS X2, X3
+	ADDPS X4, X5
+	ADDPS X6, X7
+	ADDPS X8, X9
+
+	MOVSD X3, (DI)        // dst[i] = X_i
+	MOVSD X5, (DI)(R10*1)
+	LEAQ  (DI)(R10*2), DI // DI = &(DI[incDst*2])
+	MOVSD X7, (DI)
+	MOVSD X9, (DI)(R10*1)
+	LEAQ  (SI)(R8*2), SI  // SI = &(SI[incX*2])
+	LEAQ  (DX)(R9*2), DX  // DX = &(DX[incY*2])
+	LEAQ  (DI)(R10*2), DI // DI = &(DI[incDst*2])
+	DECQ  BX
+	JNZ   axpyi_loop      // } while --BX > 0
+	CMPQ  CX, $0          // if CX == 0 { return }
+	JE    axpyi_end
+
+axpyi_tail: // do {
+	MOVSD (SI), X3 // X_i     = { imag(x[i]), real(x[i]) }
+	MOVSHDUP_X3_X2 // X_(i-1) = { imag(x[i]), imag(x[i]) }
+	MOVSLDUP_X3_X3 // X_i     = { real(x[i]), real(x[i]) }
+
+	// X_i     = { imag(a) * real(x[i]),  real(a) * real(x[i]) }
+	// X_(i-1) = { real(a) * imag(x[i]),  imag(a) * imag(x[i]) }
+	MULPS X1, X2
+	MULPS X0, X3
+
+	// X_i = {
+	//	imag(result[i]):   imag(a)*real(x[i]) + real(a)*imag(x[i]),
+	//	real(result[i]):   real(a)*real(x[i]) - imag(a)*imag(x[i]),
+	//  }
+	ADDSUBPS_X2_X3
+
+	// X_i = { imag(result[i]) + imag(y[i]),  real(result[i]) + real(y[i])  }
+	MOVSD (DX), X4
+	ADDPS X4, X3
+	MOVSD X3, (DI)   // dst[i] = X_i
+	ADDQ  R8, SI     // SI += incX
+	ADDQ  R9, DX     // DX += incY
+	ADDQ  R10, DI    // DI += incDst
+	LOOP  axpyi_tail // } while --CX > 0
+
+axpyi_end:
+	RET
--- a/vendor/gonum.org/v1/gonum/internal/asm/c64/axpyunitary_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/axpyunitary_amd64.s
@@ -0,0 +1,160 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+// MOVSHDUP X3, X2
+#define MOVSHDUP_X3_X2 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xD3
+// MOVSLDUP X3, X3
+#define MOVSLDUP_X3_X3 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xDB
+// ADDSUBPS X2, X3
+#define ADDSUBPS_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA
+
+// MOVSHDUP X5, X4
+#define MOVSHDUP_X5_X4 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xE5
+// MOVSLDUP X5, X5
+#define MOVSLDUP_X5_X5 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xED
+// ADDSUBPS X4, X5
+#define ADDSUBPS_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC
+
+// MOVSHDUP X7, X6
+#define MOVSHDUP_X7_X6 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xF7
+// MOVSLDUP X7, X7
+#define MOVSLDUP_X7_X7 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xFF
+// ADDSUBPS X6, X7
+#define ADDSUBPS_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE
+
+// MOVSHDUP X9, X8
+#define MOVSHDUP_X9_X8 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x16; BYTE $0xC1
+// MOVSLDUP X9, X9
+#define MOVSLDUP_X9_X9 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC9
+// ADDSUBPS X8, X9
+#define ADDSUBPS_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8
+
+// func AxpyUnitary(alpha complex64, x, y []complex64)
+//
+// AxpyUnitary adds alpha*x[i] to y[i] in place for i in
+// [0, min(len(x), len(y))). If &y is not 16-byte aligned, one element is
+// processed first so that the ADDPS memory operands in caxy_loop are aligned.
+// caxy_loop then handles eight elements per iteration and caxy_tail handles
+// the remaining elements one at a time.
+//
+// Register roles: SI = &x, DI = &y, AX = element index i, BX = n / 8,
+// CX = remaining element count, X0/X10 = alpha repeated twice,
+// X1/X11 = alpha with real and imaginary parts swapped, repeated twice.
+TEXT ·AxpyUnitary(SB), NOSPLIT, $0
+	MOVQ    x_base+8(FP), SI  // SI = &x
+	MOVQ    y_base+32(FP), DI // DI = &y
+	MOVQ    x_len+16(FP), CX  // CX = min( len(x), len(y) )
+	CMPQ    y_len+40(FP), CX
+	CMOVQLE y_len+40(FP), CX
+	CMPQ    CX, $0            // if CX == 0 { return }
+	JE      caxy_end
+	PXOR    X0, X0            // Clear work registers and cache-align loop
+	PXOR    X1, X1
+	MOVSD   alpha+0(FP), X0   // X0 = { 0, 0, imag(a), real(a) }
+	SHUFPD  $0, X0, X0        // X0  = { imag(a), real(a), imag(a), real(a) }
+	MOVAPS  X0, X1
+	SHUFPS  $0x11, X1, X1     // X1 = { real(a), imag(a), real(a), imag(a) }
+	XORQ    AX, AX            // i = 0
+	MOVQ    DI, BX            // Align on 16-byte boundary for ADDPS
+	ANDQ    $15, BX           // BX = &y & 15
+	JZ      caxy_no_trim      // if BX == 0 { goto caxy_no_trim }
+
+	// Trim first value in unaligned buffer
+	XORPS X2, X2         // Clear work registers and cache-align loop
+	XORPS X3, X3
+	XORPS X4, X4
+	MOVSD (SI)(AX*8), X3 // X3 = { imag(x[i]), real(x[i]) }
+	MOVSHDUP_X3_X2       // X2 = { imag(x[i]), imag(x[i]) }
+	MOVSLDUP_X3_X3       // X3 = { real(x[i]), real(x[i]) }
+	MULPS X1, X2         // X2 = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
+	MULPS X0, X3         // X3 = { imag(a) * real(x[i]), real(a) * real(x[i]) }
+
+	// X3 = { imag(a)*real(x[i]) + real(a)*imag(x[i]), real(a)*real(x[i]) - imag(a)*imag(x[i]) }
+	ADDSUBPS_X2_X3
+	MOVSD (DI)(AX*8), X4 // X3 += y[i]
+	ADDPS X4, X3
+	MOVSD X3, (DI)(AX*8) // y[i]  = X3
+	INCQ  AX             // i++
+	DECQ  CX             // --CX
+	JZ    caxy_end       // if CX == 0 { return }
+
+caxy_no_trim:
+	MOVAPS X0, X10   // Copy X0 and X1 for pipelining
+	MOVAPS X1, X11
+	MOVQ   CX, BX
+	ANDQ   $7, CX    // CX = n % 8
+	SHRQ   $3, BX    // BX = floor( n / 8 )
+	JZ     caxy_tail // if BX == 0 { goto caxy_tail }
+
+caxy_loop: // do {
+	// X_i = { imag(x[i+1]), real(x[i+1]), imag(x[i]), real(x[i]) }
+	MOVUPS (SI)(AX*8), X3
+	MOVUPS 16(SI)(AX*8), X5
+	MOVUPS 32(SI)(AX*8), X7
+	MOVUPS 48(SI)(AX*8), X9
+
+	// X_(i-1) = { imag(x[i+1]), imag(x[i+1]), imag(x[i]), imag(x[i]) }
+	MOVSHDUP_X3_X2
+	MOVSHDUP_X5_X4
+	MOVSHDUP_X7_X6
+	MOVSHDUP_X9_X8
+
+	// X_i = { real(x[i+1]), real(x[i+1]), real(x[i]), real(x[i]) }
+	MOVSLDUP_X3_X3
+	MOVSLDUP_X5_X5
+	MOVSLDUP_X7_X7
+	MOVSLDUP_X9_X9
+
+	// X_i     = {  imag(a) * real(x[i]),   real(a) * real(x[i]),
+	// 		imag(a) * real(x[i+1]), real(a) * real(x[i+1])  }
+	// X_(i-1) = {  real(a) * imag(x[i]),   imag(a) * imag(x[i]),
+	//		real(a) * imag(x[i+1]), imag(a) * imag(x[i+1])  }
+	MULPS X1, X2
+	MULPS X0, X3
+	MULPS X11, X4
+	MULPS X10, X5
+	MULPS X1, X6
+	MULPS X0, X7
+	MULPS X11, X8
+	MULPS X10, X9
+
+	// X_i = {
+	//	imag(result[i]):   imag(a)*real(x[i]) + real(a)*imag(x[i]),
+	//	real(result[i]):   real(a)*real(x[i]) - imag(a)*imag(x[i]),
+	//	imag(result[i+1]): imag(a)*real(x[i+1]) + real(a)*imag(x[i+1]),
+	//	real(result[i+1]): real(a)*real(x[i+1]) - imag(a)*imag(x[i+1]),
+	//  }
+	ADDSUBPS_X2_X3
+	ADDSUBPS_X4_X5
+	ADDSUBPS_X6_X7
+	ADDSUBPS_X8_X9
+
+	// X_i = { imag(result[i])   + imag(y[i]),   real(result[i])   + real(y[i]),
+	//	   imag(result[i+1]) + imag(y[i+1]), real(result[i+1]) + real(y[i+1])  }
+	ADDPS  (DI)(AX*8), X3
+	ADDPS  16(DI)(AX*8), X5
+	ADDPS  32(DI)(AX*8), X7
+	ADDPS  48(DI)(AX*8), X9
+	MOVUPS X3, (DI)(AX*8)   // y[i:i+2] = X_i
+	MOVUPS X5, 16(DI)(AX*8)
+	MOVUPS X7, 32(DI)(AX*8)
+	MOVUPS X9, 48(DI)(AX*8)
+	ADDQ   $8, AX           // i += 8
+	DECQ   BX               // --BX
+	JNZ    caxy_loop        // }  while BX > 0
+	CMPQ   CX, $0           // if CX == 0  { return }
+	JE     caxy_end
+
+caxy_tail: // do {
+	MOVSD (SI)(AX*8), X3 // X3 = { imag(x[i]), real(x[i]) }
+	MOVSHDUP_X3_X2       // X2 = { imag(x[i]), imag(x[i]) }
+	MOVSLDUP_X3_X3       // X3 = { real(x[i]), real(x[i]) }
+	MULPS X1, X2         // X2 = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
+	MULPS X0, X3         // X3 = { imag(a) * real(x[i]), real(a) * real(x[i]) }
+
+	// X3 = { imag(a)*real(x[i]) + real(a)*imag(x[i]),
+	//	  real(a)*real(x[i]) - imag(a)*imag(x[i])   }
+	ADDSUBPS_X2_X3
+	MOVSD (DI)(AX*8), X4 // X3 += y[i]
+	ADDPS X4, X3
+	MOVSD X3, (DI)(AX*8) // y[i]  = X3
+	INCQ  AX             // ++i
+	LOOP  caxy_tail      // } while --CX > 0
+
+caxy_end:
+	RET
--- a/vendor/gonum.org/v1/gonum/internal/asm/c64/axpyunitaryto_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/axpyunitaryto_amd64.s
@@ -0,0 +1,157 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+// MOVSHDUP X3, X2
+#define MOVSHDUP_X3_X2 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xD3
+// MOVSLDUP X3, X3
+#define MOVSLDUP_X3_X3 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xDB
+// ADDSUBPS X2, X3
+#define ADDSUBPS_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA
+
+// MOVSHDUP X5, X4
+#define MOVSHDUP_X5_X4 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xE5
+// MOVSLDUP X5, X5
+#define MOVSLDUP_X5_X5 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xED
+// ADDSUBPS X4, X5
+#define ADDSUBPS_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC
+
+// MOVSHDUP X7, X6
+#define MOVSHDUP_X7_X6 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xF7
+// MOVSLDUP X7, X7
+#define MOVSLDUP_X7_X7 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xFF
+// ADDSUBPS X6, X7
+#define ADDSUBPS_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE
+
+// MOVSHDUP X9, X8
+#define MOVSHDUP_X9_X8 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x16; BYTE $0xC1
+// MOVSLDUP X9, X9
+#define MOVSLDUP_X9_X9 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC9
+// ADDSUBPS X8, X9
+#define ADDSUBPS_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8
+
+// func AxpyUnitaryTo(dst []complex64, alpha complex64, x, y []complex64)
+//
+// AxpyUnitaryTo stores alpha*x[i] + y[i] into dst[i] for i in
+// [0, min(len(x), len(y), len(dst))). If &y is not 16-byte aligned, one
+// element is processed first so that the ADDPS memory operands in caxy_loop
+// are aligned. caxy_loop then handles eight elements per iteration and
+// caxy_tail handles the remaining elements one at a time.
+//
+// Register roles: DI = &dst, SI = &x, DX = &y, AX = element index i,
+// BX = n / 8, CX = remaining element count, X0/X10 = alpha repeated twice,
+// X1/X11 = alpha with real and imaginary parts swapped, repeated twice.
+TEXT ·AxpyUnitaryTo(SB), NOSPLIT, $0
+	MOVQ    dst_base+0(FP), DI // DI = &dst
+	MOVQ    x_base+32(FP), SI  // SI = &x
+	MOVQ    y_base+56(FP), DX  // DX = &y
+	MOVQ    x_len+40(FP), CX
+	CMPQ    y_len+64(FP), CX   // CX = min( len(x), len(y), len(dst) )
+	CMOVQLE y_len+64(FP), CX
+	CMPQ    dst_len+8(FP), CX
+	CMOVQLE dst_len+8(FP), CX
+	CMPQ    CX, $0             // if CX == 0 { return }
+	JE      caxy_end
+	MOVSD   alpha+24(FP), X0   // X0 = { 0, 0, imag(a), real(a) }
+	SHUFPD  $0, X0, X0         // X0  = { imag(a), real(a), imag(a), real(a) }
+	MOVAPS  X0, X1
+	SHUFPS  $0x11, X1, X1      // X1 = { real(a), imag(a), real(a), imag(a) }
+	XORQ    AX, AX             // i = 0
+	MOVQ    DX, BX             // Align on 16-byte boundary for ADDPS
+	ANDQ    $15, BX            // BX = &y & 15
+	JZ      caxy_no_trim       // if BX == 0 { goto caxy_no_trim }
+
+	// Trim first value in unaligned buffer
+	MOVSD (SI)(AX*8), X3 // X3 = { imag(x[i]), real(x[i]) }
+	MOVSHDUP_X3_X2       // X2 = { imag(x[i]), imag(x[i]) }
+	MOVSLDUP_X3_X3       // X3 = { real(x[i]), real(x[i]) }
+	MULPS X1, X2         // X2 = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
+	MULPS X0, X3         // X3 = { imag(a) * real(x[i]), real(a) * real(x[i]) }
+
+	// X3 = { imag(a)*real(x[i]) + real(a)*imag(x[i]), real(a)*real(x[i]) - imag(a)*imag(x[i]) }
+	ADDSUBPS_X2_X3
+	MOVSD (DX)(AX*8), X4 // X3 += y[i]
+	ADDPS X4, X3
+	MOVSD X3, (DI)(AX*8) // dst[i]  = X3
+	INCQ  AX             // i++
+	DECQ  CX             // --CX
+
+	// Nothing is left when the trimmed element was the only one. caxy_tail is
+	// a do-while driven by LOOP, so entering it with CX == 0 would process one
+	// element past the slices and then wrap CX instead of terminating.
+	JZ    caxy_end       // if CX == 0 { return }
+
+caxy_no_trim:
+	MOVAPS X0, X10   // Copy X0 and X1 for pipelining
+	MOVAPS X1, X11
+	MOVQ   CX, BX
+	ANDQ   $7, CX    // CX = n % 8
+	SHRQ   $3, BX    // BX = floor( n / 8 )
+	JZ     caxy_tail // if BX == 0 { goto caxy_tail }
+
+caxy_loop: // do {
+	// X_i = { imag(x[i+1]), real(x[i+1]), imag(x[i]), real(x[i]) }
+	MOVUPS (SI)(AX*8), X3
+	MOVUPS 16(SI)(AX*8), X5
+	MOVUPS 32(SI)(AX*8), X7
+	MOVUPS 48(SI)(AX*8), X9
+
+	// X_(i-1) = { imag(x[i+1]), imag(x[i+1]), imag(x[i]), imag(x[i]) }
+	MOVSHDUP_X3_X2
+	MOVSHDUP_X5_X4
+	MOVSHDUP_X7_X6
+	MOVSHDUP_X9_X8
+
+	// X_i = { real(x[i+1]), real(x[i+1]), real(x[i]), real(x[i]) }
+	MOVSLDUP_X3_X3
+	MOVSLDUP_X5_X5
+	MOVSLDUP_X7_X7
+	MOVSLDUP_X9_X9
+
+	// X_i     = {  imag(a) * real(x[i]),   real(a) * real(x[i]),
+	// 		imag(a) * real(x[i+1]), real(a) * real(x[i+1])  }
+	// X_(i-1) = {  real(a) * imag(x[i]),   imag(a) * imag(x[i]),
+	//		real(a) * imag(x[i+1]), imag(a) * imag(x[i+1])  }
+	MULPS X1, X2
+	MULPS X0, X3
+	MULPS X11, X4
+	MULPS X10, X5
+	MULPS X1, X6
+	MULPS X0, X7
+	MULPS X11, X8
+	MULPS X10, X9
+
+	// X_i = {
+	//	imag(result[i]):   imag(a)*real(x[i]) + real(a)*imag(x[i]),
+	//	real(result[i]):   real(a)*real(x[i]) - imag(a)*imag(x[i]),
+	//	imag(result[i+1]): imag(a)*real(x[i+1]) + real(a)*imag(x[i+1]),
+	//	real(result[i+1]): real(a)*real(x[i+1]) - imag(a)*imag(x[i+1]),
+	//  }
+	ADDSUBPS_X2_X3
+	ADDSUBPS_X4_X5
+	ADDSUBPS_X6_X7
+	ADDSUBPS_X8_X9
+
+	// X_i = { imag(result[i])   + imag(y[i]),   real(result[i])   + real(y[i]),
+	//	   imag(result[i+1]) + imag(y[i+1]), real(result[i+1]) + real(y[i+1])  }
+	ADDPS  (DX)(AX*8), X3
+	ADDPS  16(DX)(AX*8), X5
+	ADDPS  32(DX)(AX*8), X7
+	ADDPS  48(DX)(AX*8), X9
+	MOVUPS X3, (DI)(AX*8)   // dst[i:i+2] = X_i
+	MOVUPS X5, 16(DI)(AX*8)
+	MOVUPS X7, 32(DI)(AX*8)
+	MOVUPS X9, 48(DI)(AX*8)
+	ADDQ   $8, AX           // i += 8
+	DECQ   BX               // --BX
+	JNZ    caxy_loop        // }  while BX > 0
+	CMPQ   CX, $0           // if CX == 0  { return }
+	JE     caxy_end
+
+caxy_tail: // do {    (requires CX > 0 on entry)
+	MOVSD (SI)(AX*8), X3 // X3 = { imag(x[i]), real(x[i]) }
+	MOVSHDUP_X3_X2       // X2 = { imag(x[i]), imag(x[i]) }
+	MOVSLDUP_X3_X3       // X3 = { real(x[i]), real(x[i]) }
+	MULPS X1, X2         // X2 = { real(a) * imag(x[i]), imag(a) * imag(x[i]) }
+	MULPS X0, X3         // X3 = { imag(a) * real(x[i]), real(a) * real(x[i]) }
+
+	// X3 = { imag(a)*real(x[i]) + real(a)*imag(x[i]),
+	//	  real(a)*real(x[i]) - imag(a)*imag(x[i])  }
+	ADDSUBPS_X2_X3
+	MOVSD (DX)(AX*8), X4 // X3 += y[i]
+	ADDPS X4, X3
+	MOVSD X3, (DI)(AX*8) // dst[i]  = X3
+	INCQ  AX             // ++i
+	LOOP  caxy_tail      // } while --CX > 0
+
+caxy_end:
+	RET
--- a/vendor/gonum.org/v1/gonum/internal/asm/c64/conj.go
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/conj.go
@@ -0,0 +1,7 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package c64
+
+func conj(c complex64) complex64 { return complex(real(c), -imag(c)) }
--- a/vendor/gonum.org/v1/gonum/internal/asm/c64/doc.go
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/doc.go
@@ -0,0 +1,6 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package c64 provides complex64 vector primitives.
+package c64 // import "gonum.org/v1/gonum/internal/asm/c64"
--- a/vendor/gonum.org/v1/gonum/internal/asm/c64/dotcinc_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/dotcinc_amd64.s
@@ -0,0 +1,160 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define MOVSHDUP_X3_X2    LONG $0xD3160FF3 // MOVSHDUP X3, X2
+#define MOVSHDUP_X5_X4    LONG $0xE5160FF3 // MOVSHDUP X5, X4
+#define MOVSHDUP_X7_X6    LONG $0xF7160FF3 // MOVSHDUP X7, X6
+#define MOVSHDUP_X9_X8    LONG $0x160F45F3; BYTE $0xC1 // MOVSHDUP X9, X8
+
+#define MOVSLDUP_X3_X3    LONG $0xDB120FF3 // MOVSLDUP X3, X3
+#define MOVSLDUP_X5_X5    LONG $0xED120FF3 // MOVSLDUP X5, X5
+#define MOVSLDUP_X7_X7    LONG $0xFF120FF3 // MOVSLDUP X7, X7
+#define MOVSLDUP_X9_X9    LONG $0x120F45F3; BYTE $0xC9 // MOVSLDUP X9, X9
+
+#define ADDSUBPS_X2_X3    LONG $0xDAD00FF2 // ADDSUBPS X2, X3
+#define ADDSUBPS_X4_X5    LONG $0xECD00FF2 // ADDSUBPS X4, X5
+#define ADDSUBPS_X6_X7    LONG $0xFED00FF2 // ADDSUBPS X6, X7
+#define ADDSUBPS_X8_X9    LONG $0xD00F45F2; BYTE $0xC8 // ADDSUBPS X8, X9
+
+#define X_PTR SI
+#define Y_PTR DI
+#define LEN CX
+#define TAIL BX
+#define SUM X0
+#define P_SUM X1
+#define INC_X R8
+#define INCx3_X R9
+#define INC_Y R10
+#define INCx3_Y R11
+#define NEG1 X15
+#define P_NEG1 X14
+
+// func DotcInc(x, y []complex64, n, incX, incY, ix, iy uintptr) (sum complex64)
+//
+// DotcInc returns the sum of y[iy] * conj(x[ix]) over n elements, stepping the
+// x position by incX and the y position by incY after every element. Four
+// elements are handled per dotc_loop iteration, accumulated alternately into
+// SUM and P_SUM; the remaining n % 4 elements are handled one at a time in
+// dotc_tail. Only the low 64 bits of SUM are returned.
+TEXT ·DotcInc(SB), NOSPLIT, $0
+	MOVQ   x_base+0(FP), X_PTR     // X_PTR = &x
+	MOVQ   y_base+24(FP), Y_PTR    // Y_PTR = &y
+	PXOR   SUM, SUM                // SUM = 0
+	PXOR   P_SUM, P_SUM            // P_SUM = 0
+	MOVQ   n+48(FP), LEN           // LEN = n
+	CMPQ   LEN, $0                 // if LEN == 0 { return }
+	JE     dotc_end
+	MOVQ   ix+72(FP), INC_X        // INC_X temporarily holds ix
+	MOVQ   iy+80(FP), INC_Y        // INC_Y temporarily holds iy
+	LEAQ   (X_PTR)(INC_X*8), X_PTR // X_PTR = &(X_PTR[ix])
+	LEAQ   (Y_PTR)(INC_Y*8), Y_PTR // Y_PTR = &(Y_PTR[iy])
+	MOVQ   incX+56(FP), INC_X      // INC_X = incX * sizeof(complex64)
+	SHLQ   $3, INC_X
+	MOVQ   incY+64(FP), INC_Y      // INC_Y = incY * sizeof(complex64)
+	SHLQ   $3, INC_Y
+	MOVSS  $(-1.0), NEG1
+	SHUFPS $0, NEG1, NEG1          // { -1, -1, -1, -1 }
+
+	MOVQ LEN, TAIL
+	ANDQ $3, TAIL  // TAIL = LEN % 4
+	SHRQ $2, LEN   // LEN = floor( LEN / 4 )
+	JZ   dotc_tail // if LEN == 0 { goto dotc_tail }
+
+	MOVUPS NEG1, P_NEG1              // Copy NEG1 for pipelining
+	LEAQ   (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3
+	LEAQ   (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = INC_Y * 3
+
+dotc_loop: // do {
+	MOVSD (X_PTR), X3            // X_i = { imag(x[i]), real(x[i]) }
+	MOVSD (X_PTR)(INC_X*1), X5
+	MOVSD (X_PTR)(INC_X*2), X7
+	MOVSD (X_PTR)(INCx3_X*1), X9
+
+	// X_(i-1) = { imag(x[i]), imag(x[i]) }
+	MOVSHDUP_X3_X2
+	MOVSHDUP_X5_X4
+	MOVSHDUP_X7_X6
+	MOVSHDUP_X9_X8
+
+	// X_i = { real(x[i]), real(x[i]) }
+	MOVSLDUP_X3_X3
+	MOVSLDUP_X5_X5
+	MOVSLDUP_X7_X7
+	MOVSLDUP_X9_X9
+
+	// X_(i-1) = { -imag(x[i]), -imag(x[i]) }
+	MULPS NEG1, X2
+	MULPS P_NEG1, X4
+	MULPS NEG1, X6
+	MULPS P_NEG1, X8
+
+	// X_j = { imag(y[i]), real(y[i]) }
+	MOVSD (Y_PTR), X10
+	MOVSD (Y_PTR)(INC_Y*1), X11
+	MOVSD (Y_PTR)(INC_Y*2), X12
+	MOVSD (Y_PTR)(INCx3_Y*1), X13
+
+	// X_i     = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
+	MULPS X10, X3
+	MULPS X11, X5
+	MULPS X12, X7
+	MULPS X13, X9
+
+	// X_j = { real(y[i]), imag(y[i]) }
+	SHUFPS $0xB1, X10, X10
+	SHUFPS $0xB1, X11, X11
+	SHUFPS $0xB1, X12, X12
+	SHUFPS $0xB1, X13, X13
+
+	// X_(i-1) = { -real(y[i]) * imag(x[i]), -imag(y[i]) * imag(x[i]) }
+	MULPS X10, X2
+	MULPS X11, X4
+	MULPS X12, X6
+	MULPS X13, X8
+
+	// X_i = {
+	//	imag(result[i]):  imag(y[i]) * real(x[i]) - real(y[i]) * imag(x[i]),
+	//	real(result[i]):  real(y[i]) * real(x[i]) + imag(y[i]) * imag(x[i])  }
+	ADDSUBPS_X2_X3
+	ADDSUBPS_X4_X5
+	ADDSUBPS_X6_X7
+	ADDSUBPS_X8_X9
+
+	// SUM += X_i
+	ADDPS X3, SUM
+	ADDPS X5, P_SUM
+	ADDPS X7, SUM
+	ADDPS X9, P_SUM
+
+	LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[INC_X*4])
+	LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[INC_Y*4])
+
+	DECQ LEN
+	JNZ  dotc_loop // } while --LEN > 0
+
+	ADDPS P_SUM, SUM // SUM = { P_SUM + SUM }
+	CMPQ  TAIL, $0   // if TAIL == 0 { return }
+	JE    dotc_end
+
+dotc_tail: // do {
+	MOVSD  (X_PTR), X3    // X_i = { imag(x[i]), real(x[i]) }
+	MOVSHDUP_X3_X2        // X_(i-1) = { imag(x[i]), imag(x[i]) }
+	MOVSLDUP_X3_X3        // X_i = { real(x[i]), real(x[i]) }
+	MULPS  NEG1, X2       // X_(i-1) = { -imag(x[i]), -imag(x[i]) }
+
+	// Load exactly one complex64 (8 bytes). A 16-byte load here would read
+	// past the end of y when this is the slice's last element.
+	MOVSD  (Y_PTR), X10   // X_j = { imag(y[i]), real(y[i]) }
+	MULPS  X10, X3        // X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
+	SHUFPS $0x1, X10, X10 // X_j = { real(y[i]), imag(y[i]) } in the low pair
+	MULPS  X10, X2        // X_(i-1) = { -real(y[i]) * imag(x[i]), -imag(y[i]) * imag(x[i]) }
+
+	// X_i = {
+	//	imag(result[i]):  imag(y[i])*real(x[i]) - real(y[i])*imag(x[i]),
+	//	real(result[i]):  real(y[i])*real(x[i]) + imag(y[i])*imag(x[i]) }
+	ADDSUBPS_X2_X3
+	ADDPS X3, SUM      // SUM += X_i
+	ADDQ  INC_X, X_PTR // X_PTR += INC_X
+	ADDQ  INC_Y, Y_PTR // Y_PTR += INC_Y
+	DECQ  TAIL
+	JNZ   dotc_tail    // } while --TAIL > 0
+
+dotc_end:
+	MOVSD SUM, sum+88(FP) // return SUM
+	RET
--- a/vendor/gonum.org/v1/gonum/internal/asm/c64/dotcunitary_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/dotcunitary_amd64.s
@@ -0,0 +1,208 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define MOVSLDUP_XPTR_IDX_8__X3    LONG $0x1C120FF3; BYTE $0xC6 // MOVSLDUP (SI)(AX*8), X3
+#define MOVSLDUP_16_XPTR_IDX_8__X5    LONG $0x6C120FF3; WORD $0x10C6 // MOVSLDUP 16(SI)(AX*8), X5
+#define MOVSLDUP_32_XPTR_IDX_8__X7    LONG $0x7C120FF3; WORD $0x20C6 // MOVSLDUP 32(SI)(AX*8), X7
+#define MOVSLDUP_48_XPTR_IDX_8__X9    LONG $0x120F44F3; WORD $0xC64C; BYTE $0x30 // MOVSLDUP 48(SI)(AX*8), X9
+
+#define MOVSHDUP_XPTR_IDX_8__X2    LONG $0x14160FF3; BYTE $0xC6 // MOVSHDUP (SI)(AX*8), X2
+#define MOVSHDUP_16_XPTR_IDX_8__X4    LONG $0x64160FF3; WORD $0x10C6 // MOVSHDUP 16(SI)(AX*8), X4
+#define MOVSHDUP_32_XPTR_IDX_8__X6    LONG $0x74160FF3; WORD $0x20C6 // MOVSHDUP 32(SI)(AX*8), X6
+#define MOVSHDUP_48_XPTR_IDX_8__X8    LONG $0x160F44F3; WORD $0xC644; BYTE $0x30 // MOVSHDUP 48(SI)(AX*8), X8
+
+#define MOVSHDUP_X3_X2    LONG $0xD3160FF3 // MOVSHDUP X3, X2
+#define MOVSLDUP_X3_X3    LONG $0xDB120FF3 // MOVSLDUP X3, X3
+
+#define ADDSUBPS_X2_X3    LONG $0xDAD00FF2 // ADDSUBPS X2, X3
+#define ADDSUBPS_X4_X5    LONG $0xECD00FF2 // ADDSUBPS X4, X5
+#define ADDSUBPS_X6_X7    LONG $0xFED00FF2 // ADDSUBPS X6, X7
+#define ADDSUBPS_X8_X9    LONG $0xD00F45F2; BYTE $0xC8 // ADDSUBPS X8, X9
+
+#define X_PTR SI
+#define Y_PTR DI
+#define LEN CX
+#define TAIL BX
+#define SUM X0
+#define P_SUM X1
+#define IDX AX
+#define I_IDX DX
+#define NEG1 X15
+#define P_NEG1 X14
+
+// func DotcUnitary(x, y []complex64) (sum complex64)
+//
+// DotcUnitary returns the sum of y[i] * conj(x[i]) for i in
+// [0, min(len(x), len(y))). If &x is not 16-byte aligned, one element is
+// processed first so that the memory-operand MOVSLDUP/MOVSHDUP loads from x
+// in dotc_loop and dotc_tail_two are aligned. dotc_loop handles eight elements
+// per iteration, dotc_tail_two handles two at a time and dotc_tail_one
+// handles a final odd element.
+//
+// SUM and P_SUM each hold two partial complex sums (low and high 64 bits).
+// Whenever SUM is folded into P_SUM its upper lanes become part of the final
+// result, so every value placed in SUM before such a fold must have clean
+// (zero or valid) upper lanes.
+TEXT ·DotcUnitary(SB), NOSPLIT, $0
+	MOVQ    x_base+0(FP), X_PTR  // X_PTR = &x
+	MOVQ    y_base+24(FP), Y_PTR // Y_PTR = &y
+	PXOR    SUM, SUM             // SUM = 0
+	PXOR    P_SUM, P_SUM         // P_SUM = 0
+	MOVQ    x_len+8(FP), LEN     // LEN = min( len(x), len(y) )
+	CMPQ    y_len+32(FP), LEN
+	CMOVQLE y_len+32(FP), LEN
+	CMPQ    LEN, $0              // if LEN == 0 { return }
+	JE      dotc_end
+	XORQ    IDX, IDX             // i = 0
+	MOVSS   $(-1.0), NEG1
+	SHUFPS  $0, NEG1, NEG1       // { -1, -1, -1, -1 }
+
+	MOVQ X_PTR, DX
+	ANDQ $15, DX      // DX = &x & 15
+	JZ   dotc_aligned // if DX == 0 { goto dotc_aligned }
+
+	MOVSD  (X_PTR)(IDX*8), X3  // X_i     = { imag(x[i]), real(x[i]) }
+	MOVSHDUP_X3_X2             // X_(i-1) = { imag(x[i]), imag(x[i]) }
+	MOVSLDUP_X3_X3             // X_i     = { real(x[i]), real(x[i]) }
+	MOVSD  (Y_PTR)(IDX*8), X10 // X_j     = { imag(y[i]), real(y[i]) }
+	MULPS  NEG1, X2            // X_(i-1) = { -imag(x[i]), -imag(x[i]) }
+	MULPS  X10, X3             // X_i     = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
+
+	// Swap within each 64-bit pair so the zeroed upper lanes of X_j stay zero.
+	// Broadcasting real(y[i]) into the upper lanes instead would yield
+	// 0 * Inf = NaN there for an infinite real(y[i]); this value becomes SUM
+	// and its upper lanes are folded into the result after dotc_loop or
+	// dotc_tail_two.
+	SHUFPS $0xB1, X10, X10     // X_j     = { real(y[i]), imag(y[i]) }
+	MULPS  X10, X2             // X_(i-1) = { -real(y[i]) * imag(x[i]), -imag(y[i]) * imag(x[i]) }
+
+	// X_i = {
+	//	imag(result[i]):  imag(y[i])*real(x[i]) - real(y[i])*imag(x[i]),
+	//	real(result[i]):  real(y[i])*real(x[i]) + imag(y[i])*imag(x[i]) }
+	ADDSUBPS_X2_X3
+
+	MOVAPS X3, SUM  // SUM = X_i
+	INCQ   IDX      // IDX++
+	DECQ   LEN      // LEN--
+	JZ     dotc_ret // if LEN == 0 { goto dotc_ret }
+
+dotc_aligned:
+	MOVQ   LEN, TAIL
+	ANDQ   $7, TAIL     // TAIL = LEN % 8
+	SHRQ   $3, LEN      // LEN = floor( LEN / 8 )
+	JZ     dotc_tail    // if LEN == 0 { goto dotc_tail }
+	MOVUPS NEG1, P_NEG1 // Copy NEG1 for pipelining
+
+dotc_loop: // do {
+	MOVSLDUP_XPTR_IDX_8__X3    // X_i = { real(x[i+1]), real(x[i+1]), real(x[i]), real(x[i]) }
+	MOVSLDUP_16_XPTR_IDX_8__X5
+	MOVSLDUP_32_XPTR_IDX_8__X7
+	MOVSLDUP_48_XPTR_IDX_8__X9
+
+	MOVSHDUP_XPTR_IDX_8__X2    // X_(i-1) = { imag(x[i+1]), imag(x[i+1]), imag(x[i]), imag(x[i]) }
+	MOVSHDUP_16_XPTR_IDX_8__X4
+	MOVSHDUP_32_XPTR_IDX_8__X6
+	MOVSHDUP_48_XPTR_IDX_8__X8
+
+	// X_j = { imag(y[i+1]), real(y[i+1]), imag(y[i]), real(y[i]) }
+	MOVUPS (Y_PTR)(IDX*8), X10
+	MOVUPS 16(Y_PTR)(IDX*8), X11
+	MOVUPS 32(Y_PTR)(IDX*8), X12
+	MOVUPS 48(Y_PTR)(IDX*8), X13
+
+	// X_(i-1) = { -imag(x[i+1]), -imag(x[i+1]), -imag(x[i]), -imag(x[i]) }
+	MULPS NEG1, X2
+	MULPS P_NEG1, X4
+	MULPS NEG1, X6
+	MULPS P_NEG1, X8
+
+	// X_i     = {  imag(y[i])   * real(x[i]),   real(y[i])   * real(x[i]),
+	// 		imag(y[i+1]) * real(x[i+1]), real(y[i+1]) * real(x[i+1])  }
+	MULPS X10, X3
+	MULPS X11, X5
+	MULPS X12, X7
+	MULPS X13, X9
+
+	// X_j = { real(y[i+1]), imag(y[i+1]), real(y[i]), imag(y[i]) }
+	SHUFPS $0xB1, X10, X10
+	SHUFPS $0xB1, X11, X11
+	SHUFPS $0xB1, X12, X12
+	SHUFPS $0xB1, X13, X13
+
+	// X_(i-1) = {  -real(y[i])   * imag(x[i]),   -imag(y[i])   * imag(x[i]),
+	//		-real(y[i+1]) * imag(x[i+1]), -imag(y[i+1]) * imag(x[i+1])  }
+	MULPS X10, X2
+	MULPS X11, X4
+	MULPS X12, X6
+	MULPS X13, X8
+
+	// X_i = {
+	//	imag(result[i]):   imag(y[i])   * real(x[i])   - real(y[i])   * imag(x[i]),
+	//	real(result[i]):   real(y[i])   * real(x[i])   + imag(y[i])   * imag(x[i]),
+	//	imag(result[i+1]): imag(y[i+1]) * real(x[i+1]) - real(y[i+1]) * imag(x[i+1]),
+	//	real(result[i+1]): real(y[i+1]) * real(x[i+1]) + imag(y[i+1]) * imag(x[i+1]),
+	//  }
+	ADDSUBPS_X2_X3
+	ADDSUBPS_X4_X5
+	ADDSUBPS_X6_X7
+	ADDSUBPS_X8_X9
+
+	// SUM += X_i
+	ADDPS X3, SUM
+	ADDPS X5, P_SUM
+	ADDPS X7, SUM
+	ADDPS X9, P_SUM
+
+	ADDQ $8, IDX   // IDX += 8
+	DECQ LEN
+	JNZ  dotc_loop // } while --LEN > 0
+
+	ADDPS SUM, P_SUM // P_SUM = { P_SUM[1] + SUM[1], P_SUM[0] + SUM[0] }
+	XORPS SUM, SUM   // SUM = 0
+
+	CMPQ TAIL, $0 // if TAIL == 0 { return }
+	JE   dotc_end
+
+dotc_tail:
+	MOVQ TAIL, LEN
+	SHRQ $1, LEN       // LEN = floor( TAIL / 2 )
+	JZ   dotc_tail_one // if LEN == 0 { goto dotc_tail_one }
+
+dotc_tail_two: // do {
+	MOVSLDUP_XPTR_IDX_8__X3    // X_i = { real(x[i+1]), real(x[i+1]), real(x[i]), real(x[i]) }
+	MOVSHDUP_XPTR_IDX_8__X2    // X_(i-1) = { imag(x[i+1]), imag(x[i+1]), imag(x[i]), imag(x[i]) }
+	MOVUPS (Y_PTR)(IDX*8), X10 // X_j = { imag(y[i+1]), real(y[i+1]), imag(y[i]), real(y[i]) }
+	MULPS  NEG1, X2            // X_(i-1) = -X_(i-1)
+	MULPS  X10, X3             // X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) } per pair
+	SHUFPS $0xB1, X10, X10     // X_j = { real(y[i]), imag(y[i]) } per pair
+	MULPS  X10, X2             // X_(i-1) = { -real(y[i]) * imag(x[i]), -imag(y[i]) * imag(x[i]) } per pair
+
+	// X_i = {
+	//	imag(result[i]):  imag(y[i])*real(x[i]) - real(y[i])*imag(x[i]),
+	//	real(result[i]):  real(y[i])*real(x[i]) + imag(y[i])*imag(x[i]) }
+	ADDSUBPS_X2_X3
+
+	ADDPS X3, SUM // SUM += X_i
+
+	ADDQ $2, IDX       // IDX += 2
+	DECQ LEN
+	JNZ  dotc_tail_two // } while --LEN > 0
+
+	ADDPS SUM, P_SUM // P_SUM = { P_SUM[1] + SUM[1], P_SUM[0] + SUM[0] }
+	XORPS SUM, SUM   // SUM = 0
+
+	ANDQ $1, TAIL
+	JZ   dotc_end
+
+dotc_tail_one:
+	MOVSD  (X_PTR)(IDX*8), X3  // X_i = { imag(x[i]), real(x[i]) }
+	MOVSHDUP_X3_X2             // X_(i-1) = { imag(x[i]), imag(x[i]) }
+	MOVSLDUP_X3_X3             // X_i = { real(x[i]), real(x[i]) }
+	MOVSD  (Y_PTR)(IDX*8), X10 // X_j = { imag(y[i]), real(y[i]) }
+	MULPS  NEG1, X2            // X_(i-1) = { -imag(x[i]), -imag(x[i]) }
+	MULPS  X10, X3             // X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
+	SHUFPS $0x1, X10, X10      // X_j = { real(y[i]), imag(y[i]) } in the low pair; SUM's upper lanes are not folded after this point
+	MULPS  X10, X2             // X_(i-1) = { -real(y[i]) * imag(x[i]), -imag(y[i]) * imag(x[i]) }
+
+	// X_i = {
+	//	imag(result[i]):  imag(y[i])*real(x[i]) - real(y[i])*imag(x[i]),
+	//	real(result[i]):  real(y[i])*real(x[i]) + imag(y[i])*imag(x[i]) }
+	ADDSUBPS_X2_X3
+
+	ADDPS X3, SUM // SUM += X_i
+
+dotc_end:
+	ADDPS   P_SUM, SUM   // SUM = { P_SUM[0] + SUM[0] }
+	MOVHLPS P_SUM, P_SUM // P_SUM = { P_SUM[1], P_SUM[1] }
+	ADDPS   P_SUM, SUM   // SUM = { P_SUM[1] + SUM[0] }
+
+dotc_ret:
+	MOVSD SUM, sum+48(FP) // return SUM
+	RET
--- a/vendor/gonum.org/v1/gonum/internal/asm/c64/dotuinc_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/dotuinc_amd64.s
@@ -0,0 +1,148 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define MOVSHDUP_X3_X2    LONG $0xD3160FF3 // MOVSHDUP X3, X2
+#define MOVSHDUP_X5_X4    LONG $0xE5160FF3 // MOVSHDUP X5, X4
+#define MOVSHDUP_X7_X6    LONG $0xF7160FF3 // MOVSHDUP X7, X6
+#define MOVSHDUP_X9_X8    LONG $0x160F45F3; BYTE $0xC1 // MOVSHDUP X9, X8
+
+#define MOVSLDUP_X3_X3    LONG $0xDB120FF3 // MOVSLDUP X3, X3
+#define MOVSLDUP_X5_X5    LONG $0xED120FF3 // MOVSLDUP X5, X5
+#define MOVSLDUP_X7_X7    LONG $0xFF120FF3 // MOVSLDUP X7, X7
+#define MOVSLDUP_X9_X9    LONG $0x120F45F3; BYTE $0xC9 // MOVSLDUP X9, X9
+
+#define ADDSUBPS_X2_X3    LONG $0xDAD00FF2 // ADDSUBPS X2, X3
+#define ADDSUBPS_X4_X5    LONG $0xECD00FF2 // ADDSUBPS X4, X5
+#define ADDSUBPS_X6_X7    LONG $0xFED00FF2 // ADDSUBPS X6, X7
+#define ADDSUBPS_X8_X9    LONG $0xD00F45F2; BYTE $0xC8 // ADDSUBPS X8, X9
+
+#define X_PTR SI // address of the current element of x
+#define Y_PTR DI // address of the current element of y
+#define LEN CX // count of 4-element groups left for the unrolled loop
+#define TAIL BX // n % 4 elements left for the one-at-a-time tail loop
+#define SUM X0 // main accumulator; its low 64 bits are the returned complex64
+#define P_SUM X1 // second accumulator, added into SUM after the unrolled loop
+#define INC_X R8 // holds ix during setup, then the x stride in bytes (incX * 8)
+#define INCx3_X R9 // 3 * INC_X, offset of the fourth x element in a group
+#define INC_Y R10 // holds iy during setup, then the y stride in bytes (incY * 8)
+#define INCx3_Y R11 // 3 * INC_Y, offset of the fourth y element in a group
+
+// func DotuInc(x, y []complex64, n, incX, incY, ix, iy uintptr) (sum complex64)
+TEXT ·DotuInc(SB), NOSPLIT, $0 // sum of y[iy+k*incY] * x[ix+k*incX] for k in [0, n), no conjugation
+	MOVQ x_base+0(FP), X_PTR     // X_PTR = &x
+	MOVQ y_base+24(FP), Y_PTR    // Y_PTR = &y
+	PXOR SUM, SUM                // SUM = 0
+	PXOR P_SUM, P_SUM            // P_SUM = 0
+	MOVQ n+48(FP), LEN           // LEN = n
+	CMPQ LEN, $0                 // if LEN == 0 { return 0 }
+	JE   dotu_end
+	MOVQ ix+72(FP), INC_X        // INC_X = ix (register is reused for the stride below)
+	MOVQ iy+80(FP), INC_Y        // INC_Y = iy (register is reused for the stride below)
+	LEAQ (X_PTR)(INC_X*8), X_PTR // X_PTR = &(X_PTR[ix])
+	LEAQ (Y_PTR)(INC_Y*8), Y_PTR // Y_PTR = &(Y_PTR[iy])
+	MOVQ incX+56(FP), INC_X      // INC_X = incX * sizeof(complex64)
+	SHLQ $3, INC_X
+	MOVQ incY+64(FP), INC_Y      // INC_Y = incY * sizeof(complex64)
+	SHLQ $3, INC_Y
+
+	MOVQ LEN, TAIL
+	ANDQ $3, TAIL  // TAIL = LEN % 4
+	SHRQ $2, LEN   // LEN = floor( LEN / 4 )
+	JZ   dotu_tail // if LEN == 0 { goto dotu_tail }  (TAIL > 0 here because n != 0)
+
+	LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3
+	LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = INC_Y * 3
+
+dotu_loop: // do {
+	MOVSD (X_PTR), X3            // X_i = { imag(x[i]), real(x[i]) }
+	MOVSD (X_PTR)(INC_X*1), X5
+	MOVSD (X_PTR)(INC_X*2), X7
+	MOVSD (X_PTR)(INCx3_X*1), X9
+
+	// X_(i-1) = { imag(x[i]), imag(x[i]) }
+	MOVSHDUP_X3_X2
+	MOVSHDUP_X5_X4
+	MOVSHDUP_X7_X6
+	MOVSHDUP_X9_X8
+
+	// X_i = { real(x[i]), real(x[i]) }
+	MOVSLDUP_X3_X3
+	MOVSLDUP_X5_X5
+	MOVSLDUP_X7_X7
+	MOVSLDUP_X9_X9
+
+	// X_j = { imag(y[i]), real(y[i]) }
+	MOVSD (Y_PTR), X10
+	MOVSD (Y_PTR)(INC_Y*1), X11
+	MOVSD (Y_PTR)(INC_Y*2), X12
+	MOVSD (Y_PTR)(INCx3_Y*1), X13
+
+	// X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
+	MULPS X10, X3
+	MULPS X11, X5
+	MULPS X12, X7
+	MULPS X13, X9
+
+	// X_j = { real(y[i]), imag(y[i]) }
+	SHUFPS $0xB1, X10, X10
+	SHUFPS $0xB1, X11, X11
+	SHUFPS $0xB1, X12, X12
+	SHUFPS $0xB1, X13, X13
+
+	// X_(i-1) = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i]) }
+	MULPS X10, X2
+	MULPS X11, X4
+	MULPS X12, X6
+	MULPS X13, X8
+
+	// X_i = {
+	//	imag(result[i]):  imag(y[i]) * real(x[i]) + real(y[i]) * imag(x[i]),
+	//	real(result[i]):  real(y[i]) * real(x[i]) - imag(y[i]) * imag(x[i])  }
+	ADDSUBPS_X2_X3
+	ADDSUBPS_X4_X5
+	ADDSUBPS_X6_X7
+	ADDSUBPS_X8_X9
+
+	// SUM += X_i  (alternating between the two accumulators)
+	ADDPS X3, SUM
+	ADDPS X5, P_SUM
+	ADDPS X7, SUM
+	ADDPS X9, P_SUM
+
+	LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[INC_X*4])
+	LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[INC_Y*4])
+
+	DECQ LEN
+	JNZ  dotu_loop // } while --LEN > 0
+
+	ADDPS P_SUM, SUM // SUM = { P_SUM + SUM }
+	CMPQ  TAIL, $0   // if TAIL == 0 { return }
+	JE    dotu_end
+
+dotu_tail: // do {
+	MOVSD  (X_PTR), X3    // X_i = { imag(x[i]), real(x[i]) }
+	MOVSHDUP_X3_X2        // X_(i-1) = { imag(x[i]), imag(x[i]) }
+	MOVSLDUP_X3_X3        // X_i = { real(x[i]), real(x[i]) }
+	MOVSD  (Y_PTR), X10   // X_j = { imag(y[i]), real(y[i]) }; 8-byte load, never reads past y[i]
+	MULPS  X10, X3        // X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
+	SHUFPS $0x1, X10, X10 // X_j = { real(y[i]), imag(y[i]) }
+	MULPS  X10, X2        // X_(i-1) = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i]) }
+
+	// X_i = {
+	//	imag(result[i]):  imag(y[i])*real(x[i]) + real(y[i])*imag(x[i]),
+	//	real(result[i]):  real(y[i])*real(x[i]) - imag(y[i])*imag(x[i])  }
+	ADDSUBPS_X2_X3
+	ADDPS X3, SUM      // SUM += X_i
+	ADDQ  INC_X, X_PTR // X_PTR += INC_X
+	ADDQ  INC_Y, Y_PTR // Y_PTR += INC_Y
+	DECQ  TAIL
+	JNZ   dotu_tail    // } while --TAIL > 0
+
+dotu_end:
+	MOVSD SUM, sum+88(FP) // return SUM (only the low 64 bits of the register are stored)
+	RET
--- a/vendor/gonum.org/v1/gonum/internal/asm/c64/dotuunitary_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/dotuunitary_amd64.s
@@ -0,0 +1,197 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define MOVSLDUP_XPTR_IDX_8__X3    LONG $0x1C120FF3; BYTE $0xC6 // MOVSLDUP (SI)(AX*8), X3
+#define MOVSLDUP_16_XPTR_IDX_8__X5    LONG $0x6C120FF3; WORD $0x10C6 // MOVSLDUP 16(SI)(AX*8), X5
+#define MOVSLDUP_32_XPTR_IDX_8__X7    LONG $0x7C120FF3; WORD $0x20C6 // MOVSLDUP 32(SI)(AX*8), X7
+#define MOVSLDUP_48_XPTR_IDX_8__X9    LONG $0x120F44F3; WORD $0xC64C; BYTE $0x30 // MOVSLDUP 48(SI)(AX*8), X9
+
+#define MOVSHDUP_XPTR_IDX_8__X2    LONG $0x14160FF3; BYTE $0xC6 // MOVSHDUP (SI)(AX*8), X2
+#define MOVSHDUP_16_XPTR_IDX_8__X4    LONG $0x64160FF3; WORD $0x10C6 // MOVSHDUP 16(SI)(AX*8), X4
+#define MOVSHDUP_32_XPTR_IDX_8__X6    LONG $0x74160FF3; WORD $0x20C6 // MOVSHDUP 32(SI)(AX*8), X6
+#define MOVSHDUP_48_XPTR_IDX_8__X8    LONG $0x160F44F3; WORD $0xC644; BYTE $0x30 // MOVSHDUP 48(SI)(AX*8), X8
+
+#define MOVSHDUP_X3_X2    LONG $0xD3160FF3 // MOVSHDUP X3, X2
+#define MOVSLDUP_X3_X3    LONG $0xDB120FF3 // MOVSLDUP X3, X3
+
+#define ADDSUBPS_X2_X3    LONG $0xDAD00FF2 // ADDSUBPS X2, X3
+#define ADDSUBPS_X4_X5    LONG $0xECD00FF2 // ADDSUBPS X4, X5
+#define ADDSUBPS_X6_X7    LONG $0xFED00FF2 // ADDSUBPS X6, X7
+#define ADDSUBPS_X8_X9    LONG $0xD00F45F2; BYTE $0xC8 // ADDSUBPS X8, X9
+
+#define X_PTR SI // base address of x
+#define Y_PTR DI // base address of y
+#define LEN CX // loop counter: groups of 8 in the main loop, pairs in the tail
+#define TAIL BX // elements left over after the 8-wide loop (LEN % 8)
+#define SUM X0 // accumulator; its low 64 bits are the returned complex64
+#define P_SUM X1 // second accumulator, folded into SUM at dotu_end
+#define IDX AX // element index shared by x and y
+#define I_IDX DX // not referenced by DotuUnitary in this file (DX is used directly)
+#define NEG1 X15 // not referenced by DotuUnitary in this file
+#define P_NEG1 X14 // not referenced by DotuUnitary in this file
+
+// func DotuUnitary(x, y []complex64) (sum complex64)
+TEXT ·DotuUnitary(SB), NOSPLIT, $0 // sum of y[i] * x[i] over min(len(x), len(y)) elements, no conjugation
+	MOVQ    x_base+0(FP), X_PTR  // X_PTR = &x
+	MOVQ    y_base+24(FP), Y_PTR // Y_PTR = &y
+	PXOR    SUM, SUM             // SUM = 0
+	PXOR    P_SUM, P_SUM         // P_SUM = 0
+	MOVQ    x_len+8(FP), LEN     // LEN = min( len(x), len(y) )
+	CMPQ    y_len+32(FP), LEN
+	CMOVQLE y_len+32(FP), LEN
+	CMPQ    LEN, $0              // if LEN == 0 { return }
+	JE      dotu_end
+	XORQ    IDX, IDX             // IDX = 0
+
+	MOVQ X_PTR, DX
+	ANDQ $15, DX      // DX = &x & 15
+	JZ   dotu_aligned // if DX == 0 { goto dotu_aligned }
+	// &x is not 16-byte aligned: handle one element on its own before the wide loads.
+	MOVSD  (X_PTR)(IDX*8), X3  // X_i     = { imag(x[i]), real(x[i]) }
+	MOVSHDUP_X3_X2             // X_(i-1) = { imag(x[i]), imag(x[i]) }
+	MOVSLDUP_X3_X3             // X_i     = { real(x[i]), real(x[i]) }
+	MOVSD  (Y_PTR)(IDX*8), X10 // X_j     = { imag(y[i]), real(y[i]) }
+	MULPS  X10, X3             // X_i     = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
+	SHUFPS $0xB1, X10, X10     // X_j     = { real(y[i]), imag(y[i]) }, upper lanes stay 0
+	MULPS  X10, X2             // X_(i-1) = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i]) }
+	// Upper lanes must be exactly 0 here: SUM's upper half is later folded into the result.
+	// X_i = {
+	//	imag(result[i]):  imag(y[i])*real(x[i]) + real(y[i])*imag(x[i]),
+	//	real(result[i]):  real(y[i])*real(x[i]) - imag(y[i])*imag(x[i]) }
+	ADDSUBPS_X2_X3
+
+	MOVAPS X3, SUM  // SUM = X_i
+	INCQ   IDX      // IDX++
+	DECQ   LEN      // LEN--
+	JZ     dotu_end // if LEN == 0 { goto dotu_end }
+
+dotu_aligned:
+	MOVQ LEN, TAIL
+	ANDQ $7, TAIL     // TAIL = LEN % 8
+	SHRQ $3, LEN      // LEN = floor( LEN / 8 )
+	JZ   dotu_tail    // if LEN == 0 { goto dotu_tail }
+	PXOR P_SUM, P_SUM // P_SUM = 0
+
+dotu_loop: // do {
+	MOVSLDUP_XPTR_IDX_8__X3    // X_i = { real(x[i]), real(x[i]), real(x[i+1]), real(x[i+1]) }
+	MOVSLDUP_16_XPTR_IDX_8__X5
+	MOVSLDUP_32_XPTR_IDX_8__X7
+	MOVSLDUP_48_XPTR_IDX_8__X9
+
+	MOVSHDUP_XPTR_IDX_8__X2    // X_(i-1) = { imag(x[i]), imag(x[i]), imag(x[i+1]), imag(x[i+1]) }
+	MOVSHDUP_16_XPTR_IDX_8__X4
+	MOVSHDUP_32_XPTR_IDX_8__X6
+	MOVSHDUP_48_XPTR_IDX_8__X8
+
+	// X_j = { imag(y[i]), real(y[i]), imag(y[i+1]), real(y[i+1]) }
+	MOVUPS (Y_PTR)(IDX*8), X10
+	MOVUPS 16(Y_PTR)(IDX*8), X11
+	MOVUPS 32(Y_PTR)(IDX*8), X12
+	MOVUPS 48(Y_PTR)(IDX*8), X13
+
+	// X_i     = {  imag(y[i])   * real(x[i]),   real(y[i])   * real(x[i]),
+	// 		imag(y[i+1]) * real(x[i+1]), real(y[i+1]) * real(x[i+1])  }
+	MULPS X10, X3
+	MULPS X11, X5
+	MULPS X12, X7
+	MULPS X13, X9
+
+	// X_j = { real(y[i]), imag(y[i]), real(y[i+1]), imag(y[i+1]) }
+	SHUFPS $0xB1, X10, X10
+	SHUFPS $0xB1, X11, X11
+	SHUFPS $0xB1, X12, X12
+	SHUFPS $0xB1, X13, X13
+
+	// X_(i-1) = {  real(y[i])   * imag(x[i]),   imag(y[i])   * imag(x[i]),
+	//		real(y[i+1]) * imag(x[i+1]), imag(y[i+1]) * imag(x[i+1])  }
+	MULPS X10, X2
+	MULPS X11, X4
+	MULPS X12, X6
+	MULPS X13, X8
+
+	// X_i = {
+	//	imag(result[i]):   imag(y[i])   * real(x[i])   + real(y[i])   * imag(x[i]),
+	//	real(result[i]):   real(y[i])   * real(x[i])   - imag(y[i])   * imag(x[i]),
+	//	imag(result[i+1]): imag(y[i+1]) * real(x[i+1]) + real(y[i+1]) * imag(x[i+1]),
+	//	real(result[i+1]): real(y[i+1]) * real(x[i+1]) - imag(y[i+1]) * imag(x[i+1]),
+	//  }
+	ADDSUBPS_X2_X3
+	ADDSUBPS_X4_X5
+	ADDSUBPS_X6_X7
+	ADDSUBPS_X8_X9
+
+	// SUM += X_i  (alternating between the two accumulators)
+	ADDPS X3, SUM
+	ADDPS X5, P_SUM
+	ADDPS X7, SUM
+	ADDPS X9, P_SUM
+
+	ADDQ $8, IDX   // IDX += 8
+	DECQ LEN
+	JNZ  dotu_loop // } while --LEN > 0
+
+	ADDPS SUM, P_SUM // P_SUM = { P_SUM[1] + SUM[1], P_SUM[0] + SUM[0] }
+	XORPS SUM, SUM   // SUM = 0
+
+	CMPQ TAIL, $0 // if TAIL == 0 { return }
+	JE   dotu_end
+
+dotu_tail:
+	MOVQ TAIL, LEN
+	SHRQ $1, LEN       // LEN = floor( LEN / 2 )
+	JZ   dotu_tail_one // if LEN == 0 { goto dotu_tail_one }
+
+dotu_tail_two: // do {
+	MOVSLDUP_XPTR_IDX_8__X3    // X_i = { real(x[i]), real(x[i]), real(x[i+1]), real(x[i+1]) }
+	MOVSHDUP_XPTR_IDX_8__X2    // X_(i-1) = { imag(x[i]), imag(x[i]), imag(x[i+1]), imag(x[i+1]) }
+	MOVUPS (Y_PTR)(IDX*8), X10 // X_j = { imag(y[i]), real(y[i]) }  (and the same for y[i+1])
+	MULPS  X10, X3             // X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
+	SHUFPS $0xB1, X10, X10     // X_j = { real(y[i]), imag(y[i]) }
+	MULPS  X10, X2             // X_(i-1) = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i]) }
+
+	// X_i = {
+	//	imag(result[i]):  imag(y[i])*real(x[i]) + real(y[i])*imag(x[i]),
+	//	real(result[i]):  real(y[i])*real(x[i]) - imag(y[i])*imag(x[i]) }
+	ADDSUBPS_X2_X3
+
+	ADDPS X3, SUM // SUM += X_i
+
+	ADDQ $2, IDX       // IDX += 2
+	DECQ LEN
+	JNZ  dotu_tail_two // } while --LEN > 0
+
+	ADDPS SUM, P_SUM // P_SUM = { P_SUM[1] + SUM[1], P_SUM[0] + SUM[0] }
+	XORPS SUM, SUM   // SUM = 0
+
+	ANDQ $1, TAIL
+	JZ   dotu_end // if TAIL is even { goto dotu_end }
+
+dotu_tail_one:
+	MOVSD  (X_PTR)(IDX*8), X3  // X_i = { imag(x[i]), real(x[i]) }
+	MOVSHDUP_X3_X2             // X_(i-1) = { imag(x[i]), imag(x[i]) }
+	MOVSLDUP_X3_X3             // X_i = { real(x[i]), real(x[i]) }
+	MOVSD  (Y_PTR)(IDX*8), X10 // X_j = { imag(y[i]), real(y[i]) }
+	MULPS  X10, X3             // X_i = { imag(y[i]) * real(x[i]), real(y[i]) * real(x[i]) }
+	SHUFPS $0x1, X10, X10      // X_j = { real(y[i]), imag(y[i]) }
+	MULPS  X10, X2             // X_(i-1) = { real(y[i]) * imag(x[i]), imag(y[i]) * imag(x[i]) }
+
+	// X_i = {
+	//	imag(result[i]):  imag(y[i])*real(x[i]) + real(y[i])*imag(x[i]),
+	//	real(result[i]):  real(y[i])*real(x[i]) - imag(y[i])*imag(x[i]) }
+	ADDSUBPS_X2_X3
+
+	ADDPS X3, SUM // SUM += X_i
+
+dotu_end:
+	ADDPS   P_SUM, SUM   // SUM = { P_SUM[0] + SUM[0] }
+	MOVHLPS P_SUM, P_SUM // P_SUM = { P_SUM[1], P_SUM[1] }
+	ADDPS   P_SUM, SUM   // SUM = { P_SUM[1] + SUM[0] }
+
+dotu_ret:
+	MOVSD SUM, sum+48(FP) // return SUM (only the low 64 bits of the register are stored)
+	RET
--- a/vendor/gonum.org/v1/gonum/internal/asm/c64/scal.go
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/scal.go
@@ -0,0 +1,85 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package c64
+
+// ScalUnitary multiplies every element of x by alpha in place, i.e.
+//
+//	for i := range x {
+//		x[i] *= alpha
+//	}
+func ScalUnitary(alpha complex64, x []complex64) {
+	for k := 0; k < len(x); k++ {
+		x[k] = x[k] * alpha
+	}
+}
+
+// ScalUnitaryTo stores alpha times each element of x at the same index of dst, i.e.
+//
+//	for i, v := range x {
+//		dst[i] = alpha * v
+//	}
+func ScalUnitaryTo(dst []complex64, alpha complex64, x []complex64) {
+	for k := 0; k < len(x); k++ {
+		dst[k] = alpha * x[k]
+	}
+}
+
+// ScalInc multiplies n elements of x by alpha, starting at x[0] and stepping by incX:
+//
+//	var ix uintptr
+//	for i := 0; i < int(n); i++ {
+//		x[ix] *= alpha
+//		ix += incX
+//	}
+func ScalInc(alpha complex64, x []complex64, n, incX uintptr) {
+	var pos uintptr
+	for left := int(n); left > 0; left-- {
+		x[pos] = x[pos] * alpha
+		pos += incX
+	}
+}
+
+// ScalIncTo writes alpha*x[ix] to dst[idst] for n strided pairs, both starting at index 0:
+//
+//	var idst, ix uintptr
+//	for i := 0; i < int(n); i++ {
+//		dst[idst] = alpha * x[ix]
+//		ix += incX
+//		idst += incDst
+//	}
+func ScalIncTo(dst []complex64, incDst uintptr, alpha complex64, x []complex64, n, incX uintptr) {
+	var src, out uintptr
+	for left := int(n); left > 0; left-- {
+		dst[out] = alpha * x[src]
+		src += incX
+		out += incDst
+	}
+}
+
+// SscalUnitary multiplies the real and imaginary parts of every x[i] by the real alpha:
+//
+//	for i, v := range x {
+//		x[i] = complex(real(v)*alpha, imag(v)*alpha)
+//	}
+func SscalUnitary(alpha float32, x []complex64) {
+	for k := 0; k < len(x); k++ {
+		x[k] = complex(real(x[k])*alpha, imag(x[k])*alpha)
+	}
+}
+
+// SscalInc scales n elements of x by the real alpha, starting at x[0] and stepping by inc:
+//
+//	var ix uintptr
+//	for i := 0; i < int(n); i++ {
+//		x[ix] = complex(real(x[ix])*alpha, imag(x[ix])*alpha)
+//		ix += inc
+//	}
+func SscalInc(alpha float32, x []complex64, n, inc uintptr) {
+	var pos uintptr
+	for left := int(n); left > 0; left, pos = left-1, pos+inc {
+		v := x[pos]
+		x[pos] = complex(real(v)*alpha, imag(v)*alpha)
+	}
+}
--- a/vendor/gonum.org/v1/gonum/internal/asm/c64/stubs.go
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/stubs.go
@@ -0,0 +1,180 @@
+// Copyright ©2020 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package c64
+
+import (
+	"gonum.org/v1/gonum/internal/cmplx64"
+	"gonum.org/v1/gonum/internal/math32"
+)
+
+// Add accumulates s into dst element by element, over the length of s:
+//
+//	for i, v := range s {
+//		dst[i] += v
+//	}
+func Add(dst, s []complex64) {
+	for k := 0; k < len(s); k++ {
+		dst[k] = dst[k] + s[k]
+	}
+}
+
+// AddConst adds the scalar alpha to every element of x in place:
+//
+//	for i := range x {
+//		x[i] += alpha
+//	}
+func AddConst(alpha complex64, x []complex64) {
+	for k := 0; k < len(x); k++ {
+		x[k] = x[k] + alpha
+	}
+}
+
+// CumSum fills dst with the running sum of s and returns dst (untouched if s is empty):
+//
+//	if len(s) == 0 {
+//		return dst
+//	}
+//	dst[0] = s[0]
+//	for i, v := range s[1:] {
+//		dst[i+1] = dst[i] + v
+//	}
+//	return dst
+func CumSum(dst, s []complex64) []complex64 {
+	if len(s) != 0 {
+		dst[0] = s[0]
+		// Each output is the previous output plus the next input.
+		for k := 1; k < len(s); k++ {
+			dst[k] = dst[k-1] + s[k]
+		}
+	}
+	return dst
+}
+
+// CumProd fills dst with the running product of s and returns dst (untouched if s is empty):
+//
+//	if len(s) == 0 {
+//		return dst
+//	}
+//	dst[0] = s[0]
+//	for i, v := range s[1:] {
+//		dst[i+1] = dst[i] * v
+//	}
+//	return dst
+func CumProd(dst, s []complex64) []complex64 {
+	if len(s) != 0 {
+		dst[0] = s[0]
+		// Each output is the previous output times the next input.
+		for k := 1; k < len(s); k++ {
+			dst[k] = dst[k-1] * s[k]
+		}
+	}
+	return dst
+}
+
+// Div divides each element of dst by the matching element of s, in place:
+//
+//	for i, v := range s {
+//		dst[i] /= v
+//	}
+func Div(dst, s []complex64) {
+	for k := 0; k < len(s); k++ {
+		dst[k] = dst[k] / s[k]
+	}
+}
+
+// DivTo stores the element-wise quotient s[i]/t[i] in dst[i] and returns dst:
+//
+//	for i, v := range s {
+//		dst[i] = v / t[i]
+//	}
+//	return dst
+func DivTo(dst, s, t []complex64) []complex64 {
+	for k := 0; k < len(s); k++ {
+		dst[k] = s[k] / t[k]
+	}
+	return dst
+}
+
+// DotUnitary returns the dot product of x and y with each element of x conjugated:
+//
+//	for i, v := range x {
+//		sum += conj(v) * y[i]
+//	}
+//	return sum
+func DotUnitary(x, y []complex64) (sum complex64) {
+	for k := 0; k < len(x); k++ {
+		sum += cmplx64.Conj(x[k]) * y[k]
+	}
+	return sum
+}
+
+// L2DistanceUnitary returns the L2-norm of x-y; NaN if any difference has a NaN modulus.
+func L2DistanceUnitary(x, y []complex64) (norm float32) {
+	var largest float32 // largest modulus seen so far
+	ssq := float32(1.0) // sum of squares, scaled relative to largest
+	for k := 0; k < len(x); k++ {
+		diff := x[k] - y[k]
+		if diff == 0 {
+			continue
+		}
+		mag := cmplx64.Abs(diff)
+		switch {
+		case math32.IsNaN(mag):
+			return math32.NaN()
+		case largest < mag:
+			ratio := largest / mag
+			ssq = 1 + ssq*ratio*ratio
+			largest = mag
+		default:
+			ratio := mag / largest
+			ssq += ratio * ratio
+		}
+	}
+	if math32.IsInf(largest, 1) {
+		return math32.Inf(1)
+	}
+	return largest * math32.Sqrt(ssq)
+}
+
+// L2NormUnitary returns the L2-norm of x; NaN if any element has a NaN modulus.
+func L2NormUnitary(x []complex64) (norm float32) {
+	var largest float32 // largest modulus seen so far
+	ssq := float32(1.0) // sum of squares, scaled relative to largest
+	for k := 0; k < len(x); k++ {
+		if x[k] == 0 {
+			continue
+		}
+		mag := cmplx64.Abs(x[k])
+		switch {
+		case math32.IsNaN(mag):
+			return math32.NaN()
+		case largest < mag:
+			ratio := largest / mag
+			ssq = 1 + ssq*ratio*ratio
+			largest = mag
+		default:
+			ratio := mag / largest
+			ssq += ratio * ratio
+		}
+	}
+	if math32.IsInf(largest, 1) {
+		return math32.Inf(1)
+	}
+	return largest * math32.Sqrt(ssq)
+}
+
+// Sum returns the sum of the elements of x (zero for an empty slice), computed as
+//
+//	var sum complex64
+//	for i := range x {
+//		sum += x[i]
+//	}
+func Sum(x []complex64) complex64 {
+	var total complex64
+	for k := 0; k < len(x); k++ {
+		total += x[k]
+	}
+	return total
+}
--- a/vendor/gonum.org/v1/gonum/internal/asm/c64/stubs_amd64.go
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/stubs_amd64.go
@@ -0,0 +1,77 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !noasm && !gccgo && !safe
+// +build !noasm,!gccgo,!safe
+
+package c64
+
+// AxpyUnitary adds alpha*x[i] to y[i]. It is declared here without a Go body; documented as
+//
+//	for i, v := range x {
+//		y[i] += alpha * v
+//	}
+func AxpyUnitary(alpha complex64, x, y []complex64)
+
+// AxpyUnitaryTo stores alpha*x[i] + y[i] in dst[i]. Declared without a Go body; documented as
+//
+//	for i, v := range x {
+//		dst[i] = alpha*v + y[i]
+//	}
+func AxpyUnitaryTo(dst []complex64, alpha complex64, x, y []complex64)
+
+// AxpyInc is the strided form of AxpyUnitary. Declared without a Go body; documented as
+//
+//	for i := 0; i < int(n); i++ {
+//		y[iy] += alpha * x[ix]
+//		ix += incX
+//		iy += incY
+//	}
+func AxpyInc(alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr)
+
+// AxpyIncTo is the strided form of AxpyUnitaryTo. Declared without a Go body; documented as
+//
+//	for i := 0; i < int(n); i++ {
+//		dst[idst] = alpha*x[ix] + y[iy]
+//		ix += incX
+//		iy += incY
+//		idst += incDst
+//	}
+func AxpyIncTo(dst []complex64, incDst, idst uintptr, alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr)
+
+// DotcUnitary is the dot product with x conjugated. Declared without a Go body; documented as
+//
+//	for i, v := range x {
+//		sum += y[i] * conj(v)
+//	}
+//	return sum
+func DotcUnitary(x, y []complex64) (sum complex64)
+
+// DotcInc is the strided dot product with x conjugated. Declared without a Go body; documented as
+//
+//	for i := 0; i < int(n); i++ {
+//		sum += y[iy] * conj(x[ix])
+//		ix += incX
+//		iy += incY
+//	}
+//	return sum
+func DotcInc(x, y []complex64, n, incX, incY, ix, iy uintptr) (sum complex64)
+
+// DotuUnitary is the unconjugated dot product; its amd64 assembly body covers min(len(x), len(y)) elements. Documented as
+//
+//	for i, v := range x {
+//		sum += y[i] * v
+//	}
+//	return sum
+func DotuUnitary(x, y []complex64) (sum complex64)
+
+// DotuInc is the strided unconjugated dot product; its amd64 assembly body returns 0 when n == 0. Documented as
+//
+//	for i := 0; i < int(n); i++ {
+//		sum += y[iy] * x[ix]
+//		ix += incX
+//		iy += incY
+//	}
+//	return sum
+func DotuInc(x, y []complex64, n, incX, incY, ix, iy uintptr) (sum complex64)
--- a/vendor/gonum.org/v1/gonum/internal/asm/c64/stubs_noasm.go
+++ b/vendor/gonum.org/v1/gonum/internal/asm/c64/stubs_noasm.go
@@ -0,0 +1,122 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !amd64 || noasm || gccgo || safe
+// +build !amd64 noasm gccgo safe
+
+package c64
+
+// AxpyUnitary adds alpha times each element of x to the matching element of y:
+//
+//	for i, v := range x {
+//		y[i] += alpha * v
+//	}
+func AxpyUnitary(alpha complex64, x, y []complex64) {
+	for k := 0; k < len(x); k++ {
+		y[k] = y[k] + alpha*x[k]
+	}
+}
+
+// AxpyUnitaryTo stores alpha*x[i] + y[i] in dst[i] for every index of x:
+//
+//	for i, v := range x {
+//		dst[i] = alpha*v + y[i]
+//	}
+func AxpyUnitaryTo(dst []complex64, alpha complex64, x, y []complex64) {
+	for k := 0; k < len(x); k++ {
+		dst[k] = alpha*x[k] + y[k]
+	}
+}
+
+// AxpyInc adds alpha*x[ix] to y[iy] for n element pairs, stepping ix by incX and iy by incY:
+//
+//	for i := 0; i < int(n); i++ {
+//		y[iy] += alpha * x[ix]
+//		ix += incX
+//		iy += incY
+//	}
+func AxpyInc(alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr) {
+	for left := int(n); left > 0; left-- {
+		y[iy] = y[iy] + alpha*x[ix]
+		ix += incX
+		iy += incY
+	}
+}
+
+// AxpyIncTo stores alpha*x[ix] + y[iy] in dst[idst] for n strided element triples:
+//
+//	for i := 0; i < int(n); i++ {
+//		dst[idst] = alpha*x[ix] + y[iy]
+//		ix += incX
+//		iy += incY
+//		idst += incDst
+//	}
+func AxpyIncTo(dst []complex64, incDst, idst uintptr, alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr) {
+	for left := int(n); left > 0; left-- {
+		dst[idst] = alpha*x[ix] + y[iy]
+		ix += incX
+		iy += incY
+		idst += incDst
+	}
+}
+
+// DotcUnitary returns the dot product of x and y with each element of x conjugated:
+//
+//	for i, v := range x {
+//		sum += y[i] * conj(v)
+//	}
+//	return sum
+func DotcUnitary(x, y []complex64) (sum complex64) {
+	for k := 0; k < len(x); k++ {
+		sum += y[k] * conj(x[k])
+	}
+	return sum
+}
+
+// DotcInc returns the strided dot product of n element pairs with x conjugated:
+//
+//	for i := 0; i < int(n); i++ {
+//		sum += y[iy] * conj(x[ix])
+//		ix += incX
+//		iy += incY
+//	}
+//	return sum
+func DotcInc(x, y []complex64, n, incX, incY, ix, iy uintptr) (sum complex64) {
+	for left := int(n); left > 0; left-- {
+		sum += y[iy] * conj(x[ix])
+		ix += incX
+		iy += incY
+	}
+	return sum
+}
+
+// DotuUnitary returns the dot product of x and y without conjugation:
+//
+//	for i, v := range x {
+//		sum += y[i] * v
+//	}
+//	return sum
+func DotuUnitary(x, y []complex64) (sum complex64) {
+	for k := 0; k < len(x); k++ {
+		sum += y[k] * x[k]
+	}
+	return sum
+}
+
+// DotuInc returns the strided dot product of n element pairs without conjugation:
+//
+//	for i := 0; i < int(n); i++ {
+//		sum += y[iy] * x[ix]
+//		ix += incX
+//		iy += incY
+//	}
+//	return sum
+func DotuInc(x, y []complex64, n, incX, incY, ix, iy uintptr) (sum complex64) {
+	for left := int(n); left > 0; left-- {
+		sum += y[iy] * x[ix]
+		ix += incX
+		iy += incY
+	}
+	return sum
+}
--- a/vendor/gonum.org/v1/gonum/internal/asm/f32/axpyinc_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/axpyinc_amd64.s
@@ -0,0 +1,73 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+// func AxpyInc(alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr)
+TEXT ·AxpyInc(SB), NOSPLIT, $0 // y[iy+k*incY] += alpha * x[ix+k*incX] for k in [0, n)
+	MOVQ  n+56(FP), CX      // CX = n
+	CMPQ  CX, $0            // if n <= 0 (signed compare) { return }
+	JLE   axpyi_end
+	MOVQ  x_base+8(FP), SI  // SI = &x
+	MOVQ  y_base+32(FP), DI // DI = &y
+	MOVQ  ix+80(FP), R8     // R8 = ix
+	MOVQ  iy+88(FP), R9     // R9 = iy
+	LEAQ  (SI)(R8*4), SI    // SI = &(x[ix])
+	LEAQ  (DI)(R9*4), DI    // DI = &(y[iy])
+	MOVQ  DI, DX            // DX = DI   Read pointer for y (DI is the write pointer)
+	MOVQ  incX+64(FP), R8   // R8 = incX
+	SHLQ  $2, R8            // R8 *= sizeof(float32)
+	MOVQ  incY+72(FP), R9   // R9 = incY
+	SHLQ  $2, R9            // R9 *= sizeof(float32)
+	MOVSS alpha+0(FP), X0   // X0 = alpha
+	MOVSS X0, X1            // X1 = X0  // for pipelining
+	MOVQ  CX, BX            // BX = n
+	ANDQ  $3, BX            // BX = n % 4
+	SHRQ  $2, CX            // CX = floor( n / 4 )
+	JZ    axpyi_tail_start  // if CX == 0 { goto axpyi_tail_start }
+
+axpyi_loop: // Loop unrolled 4x   do {
+	MOVSS (SI), X2       // X_i = x[i]
+	MOVSS (SI)(R8*1), X3
+	LEAQ  (SI)(R8*2), SI // SI = &(SI[incX*2])
+	MOVSS (SI), X4
+	MOVSS (SI)(R8*1), X5
+	MULSS X1, X2         // X_i *= a
+	MULSS X0, X3
+	MULSS X1, X4
+	MULSS X0, X5
+	ADDSS (DX), X2       // X_i += y[i]
+	ADDSS (DX)(R9*1), X3
+	LEAQ  (DX)(R9*2), DX // DX = &(DX[incY*2])
+	ADDSS (DX), X4
+	ADDSS (DX)(R9*1), X5
+	MOVSS X2, (DI)       // y[i] = X_i
+	MOVSS X3, (DI)(R9*1)
+	LEAQ  (DI)(R9*2), DI // DI = &(DI[incY*2])
+	MOVSS X4, (DI)
+	MOVSS X5, (DI)(R9*1)
+	LEAQ  (SI)(R8*2), SI // SI = &(SI[incX*2])  // Increment addresses
+	LEAQ  (DX)(R9*2), DX // DX = &(DX[incY*2])
+	LEAQ  (DI)(R9*2), DI // DI = &(DI[incY*2])
+	LOOP  axpyi_loop     // } while --CX > 0
+	CMPQ  BX, $0         // if BX == 0 { return }
+	JE    axpyi_end
+
+axpyi_tail_start: // Reset loop registers
+	MOVQ BX, CX // Loop counter: CX = BX  (the n % 4 leftover elements)
+
+axpyi_tail: // do {
+	MOVSS (SI), X2   // X2 = x[i]
+	MULSS X1, X2     // X2 *= a
+	ADDSS (DI), X2   // X2 += y[i]  (DI serves as both read and write pointer here)
+	MOVSS X2, (DI)   // y[i] = X2
+	ADDQ  R8, SI     // SI = &(SI[incX])
+	ADDQ  R9, DI     // DI = &(DI[incY])
+	LOOP  axpyi_tail // } while --CX > 0
+
+axpyi_end:
+	RET
+
--- a/vendor/gonum.org/v1/gonum/internal/asm/f32/axpyincto_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/axpyincto_amd64.s
@@ -0,0 +1,78 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+// func AxpyIncTo(dst []float32, incDst, idst uintptr, alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr)
+TEXT ·AxpyIncTo(SB), NOSPLIT, $0 // dst[idst+k*incDst] = alpha*x[ix+k*incX] + y[iy+k*incY] for k in [0, n)
+	MOVQ  n+96(FP), CX       // CX = n
+	CMPQ  CX, $0             // if n <= 0 (signed compare) { return }
+	JLE   axpyi_end
+	MOVQ  dst_base+0(FP), DI // DI = &dst
+	MOVQ  x_base+48(FP), SI  // SI = &x
+	MOVQ  y_base+72(FP), DX  // DX = &y
+	MOVQ  ix+120(FP), R8     // R8 = ix  // Load the first index
+	MOVQ  iy+128(FP), R9     // R9 = iy
+	MOVQ  idst+32(FP), R10   // R10 = idst
+	LEAQ  (SI)(R8*4), SI     // SI = &(x[ix])
+	LEAQ  (DX)(R9*4), DX     // DX = &(y[iy])
+	LEAQ  (DI)(R10*4), DI    // DI = &(dst[idst])
+	MOVQ  incX+104(FP), R8   // R8 = incX
+	SHLQ  $2, R8             // R8 *= sizeof(float32)
+	MOVQ  incY+112(FP), R9   // R9 = incY
+	SHLQ  $2, R9             // R9 *= sizeof(float32)
+	MOVQ  incDst+24(FP), R10 // R10 = incDst
+	SHLQ  $2, R10            // R10 *= sizeof(float32)
+	MOVSS alpha+40(FP), X0   // X0 = alpha
+	MOVSS X0, X1             // X1 = X0  // for pipelining
+	MOVQ  CX, BX             // BX = n
+	ANDQ  $3, BX             // BX = n % 4
+	SHRQ  $2, CX             // CX = floor( n / 4 )
+	JZ    axpyi_tail_start   // if CX == 0 { goto axpyi_tail_start }
+
+axpyi_loop: // Loop unrolled 4x   do {
+	MOVSS (SI), X2        // X_i = x[i]
+	MOVSS (SI)(R8*1), X3
+	LEAQ  (SI)(R8*2), SI  // SI = &(SI[incX*2])
+	MOVSS (SI), X4
+	MOVSS (SI)(R8*1), X5
+	MULSS X1, X2          // X_i *= a
+	MULSS X0, X3
+	MULSS X1, X4
+	MULSS X0, X5
+	ADDSS (DX), X2        // X_i += y[i]
+	ADDSS (DX)(R9*1), X3
+	LEAQ  (DX)(R9*2), DX  // DX = &(DX[incY*2])
+	ADDSS (DX), X4
+	ADDSS (DX)(R9*1), X5
+	MOVSS X2, (DI)        // dst[i] = X_i
+	MOVSS X3, (DI)(R10*1)
+	LEAQ  (DI)(R10*2), DI // DI = &(DI[incDst*2])
+	MOVSS X4, (DI)
+	MOVSS X5, (DI)(R10*1)
+	LEAQ  (SI)(R8*2), SI  // SI = &(SI[incX*2])  // Increment addresses
+	LEAQ  (DX)(R9*2), DX  // DX = &(DX[incY*2])
+	LEAQ  (DI)(R10*2), DI // DI = &(DI[incDst*2])
+	LOOP  axpyi_loop      // } while --CX > 0
+	CMPQ  BX, $0          // if BX == 0 { return }
+	JE    axpyi_end
+
+axpyi_tail_start: // Reset loop registers
+	MOVQ BX, CX // Loop counter: CX = BX  (the n % 4 leftover elements)
+
+axpyi_tail: // do {
+	MOVSS (SI), X2   // X2 = x[i]
+	MULSS X1, X2     // X2 *= a
+	ADDSS (DX), X2   // X2 += y[i]
+	MOVSS X2, (DI)   // dst[i] = X2
+	ADDQ  R8, SI     // SI = &(SI[incX])
+	ADDQ  R9, DX     // DX = &(DX[incY])
+	ADDQ  R10, DI    // DI = &(DI[incDst])
+	LOOP  axpyi_tail // } while --CX > 0
+
+axpyi_end:
+	RET
+
--- a/vendor/gonum.org/v1/gonum/internal/asm/f32/axpyunitary_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/axpyunitary_amd64.s
@@ -0,0 +1,97 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+// func AxpyUnitary(alpha float32, x, y []float32)
+TEXT ·AxpyUnitary(SB), NOSPLIT, $0 // y[i] += alpha * x[i] for i in [0, min(len(x), len(y)))
+	MOVQ    x_base+8(FP), SI  // SI = &x
+	MOVQ    y_base+32(FP), DI // DI = &y
+	MOVQ    x_len+16(FP), BX  // BX = min( len(x), len(y) )
+	CMPQ    y_len+40(FP), BX
+	CMOVQLE y_len+40(FP), BX
+	CMPQ    BX, $0            // if BX == 0 { return }
+	JE      axpy_end
+	MOVSS   alpha+0(FP), X0
+	SHUFPS  $0, X0, X0        // X0 = { a, a, a, a }
+	XORQ    AX, AX            // i = 0
+	PXOR    X2, X2            // 2 NOP instructions (PXOR) to align
+	PXOR    X3, X3            // loop to cache line
+	MOVQ    DI, CX
+	ANDQ    $0xF, CX          // CX = &y % 16; ADDPS below reads y as an aligned memory operand
+	JZ      axpy_no_trim      // if CX == 0 { goto axpy_no_trim }
+
+	XORQ $0xF, CX // CX = floor( (16 - &y % 16) / 4 ), the elements before the next 16-byte boundary
+	INCQ CX
+	SHRQ $2, CX
+
+axpy_align: // Trim first value(s) in unaligned buffer  do {
+	MOVSS (SI)(AX*4), X2 // X2 = x[i]
+	MULSS X0, X2         // X2 *= a
+	ADDSS (DI)(AX*4), X2 // X2 += y[i]
+	MOVSS X2, (DI)(AX*4) // y[i] = X2
+	INCQ  AX             // i++
+	DECQ  BX
+	JZ    axpy_end       // if --BX == 0 { return }
+	LOOP  axpy_align     // } while --CX > 0
+
+axpy_no_trim:
+	MOVUPS X0, X1           // Copy X0 to X1 for pipelining
+	MOVQ   BX, CX
+	ANDQ   $0xF, BX         // BX = len % 16
+	SHRQ   $4, CX           // CX = int( len / 16 )
+	JZ     axpy_tail4_start // if CX == 0 { goto axpy_tail4_start }
+
+axpy_loop: // Loop unrolled 16x   do {
+	MOVUPS (SI)(AX*4), X2   // X2 = x[i:i+4]
+	MOVUPS 16(SI)(AX*4), X3
+	MOVUPS 32(SI)(AX*4), X4
+	MOVUPS 48(SI)(AX*4), X5
+	MULPS  X0, X2           // X2 *= a
+	MULPS  X1, X3
+	MULPS  X0, X4
+	MULPS  X1, X5
+	ADDPS  (DI)(AX*4), X2   // X2 += y[i:i+4]
+	ADDPS  16(DI)(AX*4), X3
+	ADDPS  32(DI)(AX*4), X4
+	ADDPS  48(DI)(AX*4), X5
+	MOVUPS X2, (DI)(AX*4)   // y[i:i+4] = X2
+	MOVUPS X3, 16(DI)(AX*4)
+	MOVUPS X4, 32(DI)(AX*4)
+	MOVUPS X5, 48(DI)(AX*4)
+	ADDQ   $16, AX          // i += 16
+	LOOP   axpy_loop        // while (--CX) > 0
+	CMPQ   BX, $0           // if BX == 0 { return }
+	JE     axpy_end
+
+axpy_tail4_start: // Reset loop counter for 4-wide tail loop
+	MOVQ BX, CX          // CX = floor( BX / 4 )
+	SHRQ $2, CX
+	JZ   axpy_tail_start // if CX == 0 { goto axpy_tail_start }
+
+axpy_tail4: // Loop unrolled 4x   do {
+	MOVUPS (SI)(AX*4), X2 // X2 = x[i:i+4]
+	MULPS  X0, X2         // X2 *= a
+	ADDPS  (DI)(AX*4), X2 // X2 += y[i:i+4]
+	MOVUPS X2, (DI)(AX*4) // y[i:i+4] = X2
+	ADDQ   $4, AX         // i += 4
+	LOOP   axpy_tail4     // } while --CX > 0
+
+axpy_tail_start: // Reset loop counter for 1-wide tail loop
+	MOVQ BX, CX   // CX = BX % 4
+	ANDQ $3, CX
+	JZ   axpy_end // if CX == 0 { return }
+
+axpy_tail:
+	MOVSS (SI)(AX*4), X1 // X1 = x[i]
+	MULSS X0, X1         // X1 *= a
+	ADDSS (DI)(AX*4), X1 // X1 += y[i]
+	MOVSS X1, (DI)(AX*4) // y[i] = X1
+	INCQ  AX             // i++
+	LOOP  axpy_tail      // } while --CX > 0
+
+axpy_end:
+	RET
--- a/vendor/gonum.org/v1/gonum/internal/asm/f32/axpyunitaryto_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/axpyunitaryto_amd64.s
@@ -0,0 +1,98 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+// func AxpyUnitaryTo(dst []float32, alpha float32, x, y []float32)
+TEXT ·AxpyUnitaryTo(SB), NOSPLIT, $0 // dst[i] = alpha*x[i] + y[i] for i < min( len(x), len(y), len(dst) )
+	MOVQ    dst_base+0(FP), DI // DI = &dst
+	MOVQ    x_base+32(FP), SI  // SI = &x
+	MOVQ    y_base+56(FP), DX  // DX = &y
+	MOVQ    x_len+40(FP), BX   // BX = min( len(x), len(y), len(dst) )
+	CMPQ    y_len+64(FP), BX
+	CMOVQLE y_len+64(FP), BX
+	CMPQ    dst_len+8(FP), BX
+	CMOVQLE dst_len+8(FP), BX
+	CMPQ    BX, $0             // if BX == 0 { return }
+	JE      axpy_end
+	MOVSS   alpha+24(FP), X0
+	SHUFPS  $0, X0, X0         // X0 = { a, a, a, a, }
+	XORQ    AX, AX             // i = 0
+	MOVQ    DX, CX
+	ANDQ    $0xF, CX           // CX = &y % 16; y is aligned on a 16-byte boundary for the packed ADDPS loads
+	JZ      axpy_no_trim       // if CX == 0 { goto axpy_no_trim }
+
+	XORQ $0xF, CX // CX = ( 16 - &y % 16 ) / 4 = elements to handle one by one before y is aligned
+	INCQ CX
+	SHRQ $2, CX
+
+axpy_align: // Trim first value(s) in unaligned buffer  do {
+	MOVSS (SI)(AX*4), X2 // X2 = x[i]
+	MULSS X0, X2         // X2 *= a
+	ADDSS (DX)(AX*4), X2 // X2 += y[i]
+	MOVSS X2, (DI)(AX*4) // dst[i] = X2
+	INCQ  AX             // i++
+	DECQ  BX
+	JZ    axpy_end       // if --BX == 0 { return }
+	LOOP  axpy_align     // } while --CX > 0
+
+axpy_no_trim:
+	MOVUPS X0, X1           // Copy X0 to X1 for pipelining
+	MOVQ   BX, CX
+	ANDQ   $0xF, BX         // BX = len % 16
+	SHRQ   $4, CX           // CX = floor( len / 16 )
+	JZ     axpy_tail4_start // if CX == 0 { goto axpy_tail4_start }
+
+axpy_loop: // Loop unrolled 16x  do {
+	MOVUPS (SI)(AX*4), X2   // X2 = x[i:i+4]
+	MOVUPS 16(SI)(AX*4), X3 // X3 = x[i+4:i+8]
+	MOVUPS 32(SI)(AX*4), X4 // X4 = x[i+8:i+12]
+	MOVUPS 48(SI)(AX*4), X5 // X5 = x[i+12:i+16]
+	MULPS  X0, X2           // X2 *= a
+	MULPS  X1, X3
+	MULPS  X0, X4
+	MULPS  X1, X5
+	ADDPS  (DX)(AX*4), X2   // X2 += y[i:i+4]
+	ADDPS  16(DX)(AX*4), X3
+	ADDPS  32(DX)(AX*4), X4
+	ADDPS  48(DX)(AX*4), X5
+	MOVUPS X2, (DI)(AX*4)   // dst[i:i+4] = X2
+	MOVUPS X3, 16(DI)(AX*4)
+	MOVUPS X4, 32(DI)(AX*4)
+	MOVUPS X5, 48(DI)(AX*4)
+	ADDQ   $16, AX          // i += 16
+	LOOP   axpy_loop        // while (--CX) > 0
+	CMPQ   BX, $0           // if BX == 0 { return }
+	JE     axpy_end
+
+axpy_tail4_start: // Reset loop counter for 4-wide tail loop
+	MOVQ BX, CX          // CX = floor( BX / 4 )
+	SHRQ $2, CX
+	JZ   axpy_tail_start // if CX == 0 { goto axpy_tail_start }
+
+axpy_tail4: // Loop unrolled 4x  do {
+	MOVUPS (SI)(AX*4), X2 // X2 = x[i:i+4]
+	MULPS  X0, X2         // X2 *= a
+	ADDPS  (DX)(AX*4), X2 // X2 += y[i:i+4]
+	MOVUPS X2, (DI)(AX*4) // dst[i:i+4] = X2
+	ADDQ   $4, AX         // i += 4
+	LOOP   axpy_tail4     // } while --CX > 0
+
+axpy_tail_start: // Reset loop counter for 1-wide tail loop
+	MOVQ BX, CX   // CX = BX % 4
+	ANDQ $3, CX
+	JZ   axpy_end // if CX == 0 { return }
+
+axpy_tail: // do {
+	MOVSS (SI)(AX*4), X1 // X1 = x[i]  (X1 is free to reuse: the packed loops are finished)
+	MULSS X0, X1         // X1 *= a
+	ADDSS (DX)(AX*4), X1 // X1 += y[i]
+	MOVSS X1, (DI)(AX*4) // dst[i] = X1
+	INCQ  AX             // i++
+	LOOP  axpy_tail      // } while --CX > 0
+
+axpy_end:
+	RET
--- a/vendor/gonum.org/v1/gonum/internal/asm/f32/ddotinc_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/ddotinc_amd64.s
@@ -0,0 +1,91 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define X_PTR SI
+#define Y_PTR DI
+#define LEN CX
+#define TAIL BX
+#define INC_X R8
+#define INCx3_X R10
+#define INC_Y R9
+#define INCx3_Y R11
+#define SUM X0
+#define P_SUM X1
+
+// func DdotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float64)
+TEXT ·DdotInc(SB), NOSPLIT, $0 // sum over i < n of float64(x[ix+i*incX]) * float64(y[iy+i*incY])
+	MOVQ x_base+0(FP), X_PTR  // X_PTR = &x
+	MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y
+	MOVQ n+48(FP), LEN        // LEN = n
+	PXOR SUM, SUM             // SUM = 0
+	CMPQ LEN, $0
+	JE   dot_end              // if n == 0 { return 0 }
+
+	MOVQ ix+72(FP), INC_X        // INC_X = ix  (INC_X is used as scratch for the start index here)
+	MOVQ iy+80(FP), INC_Y        // INC_Y = iy
+	LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(x[ix])
+	LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(y[iy])
+
+	MOVQ incX+56(FP), INC_X // INC_X = incX * sizeof(float32)
+	SHLQ $2, INC_X
+	MOVQ incY+64(FP), INC_Y // INC_Y = incY * sizeof(float32)
+	SHLQ $2, INC_Y
+
+	MOVQ LEN, TAIL
+	ANDQ $3, TAIL  // TAIL = LEN % 4
+	SHRQ $2, LEN   // LEN = floor( LEN / 4 )
+	JZ   dot_tail  // if LEN == 0 { goto dot_tail }
+
+	PXOR P_SUM, P_SUM              // P_SUM = 0  for pipelining
+	LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3
+	LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = INC_Y * 3
+
+dot_loop: // Loop unrolled 4x  do {
+	CVTSS2SD (X_PTR), X2            // X2..X5 = float64 of four consecutive strided elements of x
+	CVTSS2SD (X_PTR)(INC_X*1), X3
+	CVTSS2SD (X_PTR)(INC_X*2), X4
+	CVTSS2SD (X_PTR)(INCx3_X*1), X5
+
+	CVTSS2SD (Y_PTR), X6            // X6..X9 = float64 of four consecutive strided elements of y
+	CVTSS2SD (Y_PTR)(INC_Y*1), X7
+	CVTSS2SD (Y_PTR)(INC_Y*2), X8
+	CVTSS2SD (Y_PTR)(INCx3_Y*1), X9
+
+	MULSD X6, X2 // X_i *= X_j
+	MULSD X7, X3
+	MULSD X8, X4
+	MULSD X9, X5
+
+	ADDSD X2, SUM   // SUM += X_i  (products alternate between the two accumulators)
+	ADDSD X3, P_SUM
+	ADDSD X4, SUM
+	ADDSD X5, P_SUM
+
+	LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[INC_X * 4])
+	LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[INC_Y * 4])
+
+	DECQ LEN
+	JNZ  dot_loop // } while --LEN > 0
+
+	ADDSD P_SUM, SUM // SUM += P_SUM
+	CMPQ  TAIL, $0   // if TAIL == 0 { return }
+	JE    dot_end
+
+dot_tail: // do {
+	CVTSS2SD (X_PTR), X2  // X2 = float64(x[i])
+	CVTSS2SD (Y_PTR), X3  // X3 = float64(y[i])
+	MULSD    X3, X2       // X2 *= X3
+	ADDSD    X2, SUM      // SUM += X2
+	ADDQ     INC_X, X_PTR // X_PTR += INC_X
+	ADDQ     INC_Y, Y_PTR // Y_PTR += INC_Y
+	DECQ     TAIL
+	JNZ      dot_tail     // } while --TAIL > 0
+
+dot_end:
+	MOVSD SUM, sum+88(FP) // return SUM
+	RET
--- a/vendor/gonum.org/v1/gonum/internal/asm/f32/ddotunitary_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/ddotunitary_amd64.s
@@ -0,0 +1,110 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define HADDPD_SUM_SUM    LONG $0xC07C0F66 // @ HADDPD X0, X0
+
+#define X_PTR SI
+#define Y_PTR DI
+#define LEN CX
+#define TAIL BX
+#define IDX AX
+#define SUM X0
+#define P_SUM X1
+
+// func DdotUnitary(x, y []float32) (sum float64)
+TEXT ·DdotUnitary(SB), NOSPLIT, $0 // products are accumulated in float64 and the result is stored with MOVSD
+	MOVQ    x_base+0(FP), X_PTR  // X_PTR = &x
+	MOVQ    y_base+24(FP), Y_PTR // Y_PTR = &y
+	MOVQ    x_len+8(FP), LEN     // LEN = min( len(x), len(y) )
+	CMPQ    y_len+32(FP), LEN
+	CMOVQLE y_len+32(FP), LEN
+	PXOR    SUM, SUM             // SUM = 0
+	CMPQ    LEN, $0
+	JE      dot_end
+
+	XORQ IDX, IDX    // IDX = 0
+	MOVQ Y_PTR, DX
+	ANDQ $0xF, DX    // DX = &y % 16  (the trim loop runs until y is on a 16-byte boundary)
+	JZ   dot_no_trim // if DX == 0 { goto dot_no_trim }
+
+	SUBQ $16, DX // DX = -( bytes remaining until y is 16-byte aligned )
+
+dot_align: // Trim first value(s) in unaligned buffer  do {
+	CVTSS2SD (X_PTR)(IDX*4), X2 // X2 = float64(x[i])
+	CVTSS2SD (Y_PTR)(IDX*4), X3 // X3 = float64(y[i])
+	MULSD    X3, X2             // X2 *= X3
+	ADDSD    X2, SUM            // SUM += X2
+	INCQ     IDX                // IDX++
+	DECQ     LEN
+	JZ       dot_end            // if --LEN == 0 { return }
+	ADDQ     $4, DX
+	JNZ      dot_align          // } while ( DX += 4 ) != 0
+
+dot_no_trim:
+	PXOR P_SUM, P_SUM   // P_SUM = 0  for pipelining
+	MOVQ LEN, TAIL
+	ANDQ $0x7, TAIL     // TAIL = LEN % 8
+	SHRQ $3, LEN        // LEN = floor( LEN / 8 )
+	JZ   dot_tail_start // if LEN == 0 { goto dot_tail_start }
+
+dot_loop: // Loop unrolled 8x  do {
+	CVTPS2PD (X_PTR)(IDX*4), X2   // X2..X5 = float64( x[i:i+8] ), two elements per register
+	CVTPS2PD 8(X_PTR)(IDX*4), X3
+	CVTPS2PD 16(X_PTR)(IDX*4), X4
+	CVTPS2PD 24(X_PTR)(IDX*4), X5
+
+	CVTPS2PD (Y_PTR)(IDX*4), X6   // X6..X9 = float64( y[i:i+8] ), two elements per register
+	CVTPS2PD 8(Y_PTR)(IDX*4), X7
+	CVTPS2PD 16(Y_PTR)(IDX*4), X8
+	CVTPS2PD 24(Y_PTR)(IDX*4), X9
+
+	MULPD X6, X2 // X_i *= X_j
+	MULPD X7, X3
+	MULPD X8, X4
+	MULPD X9, X5
+
+	ADDPD X2, SUM   // SUM += X_i  (products alternate between the two accumulators)
+	ADDPD X3, P_SUM
+	ADDPD X4, SUM
+	ADDPD X5, P_SUM
+
+	ADDQ $8, IDX  // IDX += 8
+	DECQ LEN
+	JNZ  dot_loop // } while --LEN > 0
+
+	ADDPD P_SUM, SUM // SUM += P_SUM
+	CMPQ  TAIL, $0   // if TAIL == 0 { return }
+	JE    dot_end
+
+dot_tail_start: // Reset loop counter for 2-wide tail loop
+	MOVQ TAIL, LEN    // LEN = floor( TAIL / 2 )
+	SHRQ $1, LEN
+	JZ   dot_tail_one // if LEN == 0 { goto dot_tail_one }
+
+dot_tail_two: // do {
+	CVTPS2PD (X_PTR)(IDX*4), X2 // X2 = float64( x[i:i+2] )
+	CVTPS2PD (Y_PTR)(IDX*4), X6 // X6 = float64( y[i:i+2] )
+	MULPD    X6, X2             // X_i *= X_j
+	ADDPD    X2, SUM            // SUM += X_i
+	ADDQ     $2, IDX            // IDX += 2
+	DECQ     LEN
+	JNZ      dot_tail_two       // } while --LEN > 0
+
+	ANDQ $1, TAIL // if TAIL % 2 == 0 { return }
+	JZ   dot_end
+
+dot_tail_one: // Single remaining element
+	CVTSS2SD (X_PTR)(IDX*4), X2 // X2 = float64(x[i])
+	CVTSS2SD (Y_PTR)(IDX*4), X3 // X3 = float64(y[i])
+	MULSD    X3, X2             // X2 *= X3
+	ADDSD    X2, SUM            // SUM += X2
+
+dot_end:
+	HADDPD_SUM_SUM        // SUM = \sum{ SUM[i] }
+	MOVSD SUM, sum+48(FP) // return SUM
+	RET
--- a/vendor/gonum.org/v1/gonum/internal/asm/f32/doc.go
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/doc.go
@@ -0,0 +1,6 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package f32 provides float32 vector primitives.
+package f32 // import "gonum.org/v1/gonum/internal/asm/f32"
--- a/vendor/gonum.org/v1/gonum/internal/asm/f32/dotinc_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/dotinc_amd64.s
@@ -0,0 +1,85 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define X_PTR SI
+#define Y_PTR DI
+#define LEN CX
+#define TAIL BX
+#define INC_X R8
+#define INCx3_X R10
+#define INC_Y R9
+#define INCx3_Y R11
+#define SUM X0
+#define P_SUM X1
+
+// func DotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float32)
+TEXT ·DotInc(SB), NOSPLIT, $0 // sum over i < n of x[ix+i*incX] * y[iy+i*incY], accumulated in float32
+	MOVQ x_base+0(FP), X_PTR  // X_PTR = &x
+	MOVQ y_base+24(FP), Y_PTR // Y_PTR = &y
+	PXOR SUM, SUM             // SUM = 0
+	MOVQ n+48(FP), LEN        // LEN = n
+	CMPQ LEN, $0
+	JE   dot_end              // if n == 0 { return 0 }
+
+	MOVQ ix+72(FP), INC_X        // INC_X = ix  (INC_X is used as scratch for the start index here)
+	MOVQ iy+80(FP), INC_Y        // INC_Y = iy
+	LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(x[ix])
+	LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(y[iy])
+
+	MOVQ incX+56(FP), INC_X // INC_X := incX * sizeof(float32)
+	SHLQ $2, INC_X
+	MOVQ incY+64(FP), INC_Y // INC_Y := incY * sizeof(float32)
+	SHLQ $2, INC_Y
+
+	MOVQ LEN, TAIL
+	ANDQ $0x3, TAIL // TAIL = LEN % 4
+	SHRQ $2, LEN    // LEN = floor( LEN / 4 )
+	JZ   dot_tail   // if LEN == 0 { goto dot_tail }
+
+	PXOR P_SUM, P_SUM              // P_SUM = 0  for pipelining
+	LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3
+	LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = INC_Y * 3
+
+dot_loop: // Loop unrolled 4x  do {
+	MOVSS (X_PTR), X2            // X2..X5 = four consecutive strided elements of x
+	MOVSS (X_PTR)(INC_X*1), X3
+	MOVSS (X_PTR)(INC_X*2), X4
+	MOVSS (X_PTR)(INCx3_X*1), X5
+
+	MULSS (Y_PTR), X2            // X2..X5 *= the matching strided elements of y
+	MULSS (Y_PTR)(INC_Y*1), X3
+	MULSS (Y_PTR)(INC_Y*2), X4
+	MULSS (Y_PTR)(INCx3_Y*1), X5
+
+	ADDSS X2, SUM   // SUM += X_i  (products alternate between the two accumulators)
+	ADDSS X3, P_SUM
+	ADDSS X4, SUM
+	ADDSS X5, P_SUM
+
+	LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[INC_X * 4])
+	LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[INC_Y * 4])
+
+	DECQ LEN
+	JNZ  dot_loop // } while --LEN > 0
+
+	ADDSS P_SUM, SUM // SUM += P_SUM
+	CMPQ  TAIL, $0   // if TAIL == 0 { return }
+	JE    dot_end
+
+dot_tail: // do {
+	MOVSS (X_PTR), X2  // X2 = x[i]
+	MULSS (Y_PTR), X2  // X2 *= y[i]
+	ADDSS X2, SUM      // SUM += X2
+	ADDQ  INC_X, X_PTR // X_PTR += INC_X
+	ADDQ  INC_Y, Y_PTR // Y_PTR += INC_Y
+	DECQ  TAIL
+	JNZ   dot_tail     // } while --TAIL > 0
+
+dot_end:
+	MOVSS SUM, sum+88(FP) // return SUM
+	RET
--- a/vendor/gonum.org/v1/gonum/internal/asm/f32/dotunitary_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/dotunitary_amd64.s
@@ -0,0 +1,106 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define HADDPS_SUM_SUM    LONG $0xC07C0FF2 // @ HADDPS X0, X0
+
+#define X_PTR SI
+#define Y_PTR DI
+#define LEN CX
+#define TAIL BX
+#define IDX AX
+#define SUM X0
+#define P_SUM X1
+
+// func DotUnitary(x, y []float32) (sum float32)
+TEXT ·DotUnitary(SB), NOSPLIT, $0 // sum of x[i]*y[i] for i < min( len(x), len(y) ), accumulated in float32
+	MOVQ    x_base+0(FP), X_PTR  // X_PTR = &x
+	MOVQ    y_base+24(FP), Y_PTR // Y_PTR = &y
+	PXOR    SUM, SUM             // SUM = 0
+	MOVQ    x_len+8(FP), LEN     // LEN = min( len(x), len(y) )
+	CMPQ    y_len+32(FP), LEN
+	CMOVQLE y_len+32(FP), LEN
+	CMPQ    LEN, $0
+	JE      dot_end
+
+	XORQ IDX, IDX    // IDX = 0
+	MOVQ Y_PTR, DX
+	ANDQ $0xF, DX    // Align on 16-byte boundary for MULPS  (DX = &y % 16)
+	JZ   dot_no_trim // if DX == 0 { goto dot_no_trim }
+	SUBQ $16, DX     // DX = -( bytes remaining until y is 16-byte aligned )
+
+dot_align: // Trim first value(s) in unaligned buffer  do {
+	MOVSS (X_PTR)(IDX*4), X2 // X2 = x[i]
+	MULSS (Y_PTR)(IDX*4), X2 // X2 *= y[i]
+	ADDSS X2, SUM            // SUM += X2
+	INCQ  IDX                // IDX++
+	DECQ  LEN
+	JZ    dot_end            // if --LEN == 0 { return }
+	ADDQ  $4, DX
+	JNZ   dot_align          // } while ( DX += 4 ) != 0
+
+dot_no_trim:
+	PXOR P_SUM, P_SUM    // P_SUM = 0  for pipelining
+	MOVQ LEN, TAIL
+	ANDQ $0xF, TAIL      // TAIL = LEN % 16
+	SHRQ $4, LEN         // LEN = floor( LEN / 16 )
+	JZ   dot_tail4_start // if LEN == 0 { goto dot_tail4_start }
+
+dot_loop: // Loop unrolled 16x  do {
+	MOVUPS (X_PTR)(IDX*4), X2   // X2..X5 = x[i:i+16], four elements per register
+	MOVUPS 16(X_PTR)(IDX*4), X3
+	MOVUPS 32(X_PTR)(IDX*4), X4
+	MOVUPS 48(X_PTR)(IDX*4), X5
+
+	MULPS (Y_PTR)(IDX*4), X2   // X2..X5 *= y[i:i+16]  (y is read as a memory operand)
+	MULPS 16(Y_PTR)(IDX*4), X3
+	MULPS 32(Y_PTR)(IDX*4), X4
+	MULPS 48(Y_PTR)(IDX*4), X5
+
+	ADDPS X2, SUM   // SUM += X_i  (products alternate between the two accumulators)
+	ADDPS X3, P_SUM
+	ADDPS X4, SUM
+	ADDPS X5, P_SUM
+
+	ADDQ $16, IDX // IDX += 16
+	DECQ LEN
+	JNZ  dot_loop // } while --LEN > 0
+
+	ADDPS P_SUM, SUM // SUM += P_SUM
+	CMPQ  TAIL, $0   // if TAIL == 0 { return }
+	JE    dot_end
+
+dot_tail4_start: // Reset loop counter for 4-wide tail loop
+	MOVQ TAIL, LEN      // LEN = floor( TAIL / 4 )
+	SHRQ $2, LEN
+	JZ   dot_tail_start // if LEN == 0 { goto dot_tail_start }
+
+dot_tail4_loop: // Loop unrolled 4x  do {
+	MOVUPS (X_PTR)(IDX*4), X2 // X2 = x[i:i+4]
+	MULPS  (Y_PTR)(IDX*4), X2 // X2 *= y[i:i+4]
+	ADDPS  X2, SUM            // SUM += X2
+	ADDQ   $4, IDX            // i += 4
+	DECQ   LEN
+	JNZ    dot_tail4_loop     // } while --LEN > 0
+
+dot_tail_start: // Reset loop counter for 1-wide tail loop
+	ANDQ $3, TAIL // TAIL = TAIL % 4
+	JZ   dot_end  // if TAIL == 0 { return }
+
+dot_tail: // do {
+	MOVSS (X_PTR)(IDX*4), X2 // X2 = x[i]
+	MULSS (Y_PTR)(IDX*4), X2 // X2 *= y[i]
+	ADDSS X2, SUM            // SUM += X2
+	INCQ  IDX                // IDX++
+	DECQ  TAIL
+	JNZ   dot_tail           // } while --TAIL > 0
+
+dot_end:
+	HADDPS_SUM_SUM        // SUM = \sum{ SUM[i] }  (two horizontal adds fold the four lanes)
+	HADDPS_SUM_SUM
+	MOVSS SUM, sum+48(FP) // return SUM
+	RET
--- a/vendor/gonum.org/v1/gonum/internal/asm/f32/ge_amd64.go
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/ge_amd64.go
@@ -0,0 +1,18 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !noasm && !gccgo && !safe
+// +build !noasm,!gccgo,!safe
+
+package f32
+
+// Ger performs the rank-one operation
+//
+//	A += alpha * x * yᵀ
+//
+// where A is an m×n dense matrix, x and y are vectors, and alpha is a scalar. This declaration has no Go body: the amd64 implementation is the assembly routine ·Ger in ge_amd64.s.
+func Ger(m, n uintptr, alpha float32,
+	x []float32, incX uintptr,
+	y []float32, incY uintptr,
+	a []float32, lda uintptr)
--- a/vendor/gonum.org/v1/gonum/internal/asm/f32/ge_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/ge_amd64.s
@@ -0,0 +1,757 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define SIZE 4 // sizeof(float32) in bytes
+#define BITSIZE 2 // log2(SIZE); shift that turns an element count into a byte count
+#define KERNELSIZE 3 // log2(8); the widest kernels consume 8 columns per iteration
+// Matrix dimensions: argument slots (M_DIM, N_DIM) and the loop-counter registers loaded from them (M, N).
+#define M_DIM m+0(FP)
+#define M CX
+#define N_DIM n+8(FP)
+#define N BX
+// Scratch registers.
+#define TMP1 R14
+#define TMP2 R15
+// Cursors into x, y and A. Y is the y_base argument slot; A_ROW is the start of the current row group of A.
+#define X_PTR SI
+#define Y y_base+56(FP)
+#define Y_PTR DX
+#define A_ROW AX
+#define A_PTR DI
+// Byte stride of x and three times that stride.
+#define INC_X R8
+#define INC3_X R9
+// Byte stride of y and three times that stride.
+#define INC_Y R10
+#define INC3_Y R11
+// Byte stride between rows of A and three times that stride.
+#define LDA R12
+#define LDA3 R13
+// ALPHA lives in X0; ALPHA_SPILL is the 16-byte stack slot used to save it while X0 serves as scratch.
+#define ALPHA X0
+#define ALPHA_SPILL al-16(SP)
+// LOAD_ALPHA: ALPHA = { alpha, alpha, alpha, alpha }.
+#define LOAD_ALPHA \
+	MOVSS  alpha+16(FP), ALPHA \
+	SHUFPS $0, ALPHA, ALPHA
+// LOAD_SCALED4: X1..X4 = alpha * x[0..3], each broadcast to all four lanes (contiguous x); prefetches x 16 elements ahead.
+#define LOAD_SCALED4 \
+	PREFETCHNTA 16*SIZE(X_PTR)    \
+	MOVDDUP     (X_PTR), X1       \
+	MOVDDUP     2*SIZE(X_PTR), X3 \
+	MOVSHDUP    X1, X2            \
+	MOVSHDUP    X3, X4            \
+	MOVSLDUP    X1, X1            \
+	MOVSLDUP    X3, X3            \
+	MULPS       ALPHA, X1         \
+	MULPS       ALPHA, X2         \
+	MULPS       ALPHA, X3         \
+	MULPS       ALPHA, X4
+// LOAD_SCALED2: X1, X2 = alpha * x[0], alpha * x[1], each broadcast to all four lanes (contiguous x).
+#define LOAD_SCALED2 \
+	MOVDDUP  (X_PTR), X1 \
+	MOVSHDUP X1, X2      \
+	MOVSLDUP X1, X1      \
+	MULPS    ALPHA, X1   \
+	MULPS    ALPHA, X2
+// LOAD_SCALED1: X1 = alpha * x[0], broadcast to all four lanes.
+#define LOAD_SCALED1 \
+	MOVSS  (X_PTR), X1 \
+	SHUFPS $0, X1, X1  \
+	MULPS  ALPHA, X1
+// LOAD_SCALED4_INC: as LOAD_SCALED4, but the four x elements are INC_X bytes apart.
+#define LOAD_SCALED4_INC \
+	PREFETCHNTA (X_PTR)(INC_X*8)      \
+	MOVSS       (X_PTR), X1           \
+	MOVSS       (X_PTR)(INC_X*1), X2  \
+	MOVSS       (X_PTR)(INC_X*2), X3  \
+	MOVSS       (X_PTR)(INC3_X*1), X4 \
+	SHUFPS      $0, X1, X1            \
+	SHUFPS      $0, X2, X2            \
+	SHUFPS      $0, X3, X3            \
+	SHUFPS      $0, X4, X4            \
+	MULPS       ALPHA, X1             \
+	MULPS       ALPHA, X2             \
+	MULPS       ALPHA, X3             \
+	MULPS       ALPHA, X4
+// LOAD_SCALED2_INC: as LOAD_SCALED2, but the two x elements are INC_X bytes apart.
+#define LOAD_SCALED2_INC \
+	MOVSS  (X_PTR), X1          \
+	MOVSS  (X_PTR)(INC_X*1), X2 \
+	SHUFPS $0, X1, X1           \
+	SHUFPS $0, X2, X2           \
+	MULPS  ALPHA, X1            \
+	MULPS  ALPHA, X2
+// KERNEL_LOAD8: X5, X6 = y[0:4], y[4:8] (contiguous y). Y_PTR is not advanced.
+#define KERNEL_LOAD8 \
+	MOVUPS (Y_PTR), X5       \
+	MOVUPS 4*SIZE(Y_PTR), X6
+// KERNEL_LOAD8_INC: gathers eight y elements INC_Y bytes apart into X5, X6; uses X7-X9 as scratch and advances Y_PTR by 4*INC_Y.
+#define KERNEL_LOAD8_INC \
+	MOVSS    (Y_PTR), X5             \
+	MOVSS    (Y_PTR)(INC_Y*1), X6    \
+	MOVSS    (Y_PTR)(INC_Y*2), X7    \
+	MOVSS    (Y_PTR)(INC3_Y*1), X8   \
+	UNPCKLPS X6, X5                  \
+	UNPCKLPS X8, X7                  \
+	MOVLHPS  X7, X5                  \
+	LEAQ     (Y_PTR)(INC_Y*4), Y_PTR \
+	MOVSS    (Y_PTR), X6             \
+	MOVSS    (Y_PTR)(INC_Y*1), X7    \
+	MOVSS    (Y_PTR)(INC_Y*2), X8    \
+	MOVSS    (Y_PTR)(INC3_Y*1), X9   \
+	UNPCKLPS X7, X6                  \
+	UNPCKLPS X9, X8                  \
+	MOVLHPS  X8, X6
+// KERNEL_LOAD4: X5 = y[0:4] (contiguous y).
+#define KERNEL_LOAD4 \
+	MOVUPS (Y_PTR), X5
+// KERNEL_LOAD4_INC: gathers four y elements INC_Y bytes apart into X5; uses X6-X8 as scratch.
+#define KERNEL_LOAD4_INC \
+	MOVSS    (Y_PTR), X5           \
+	MOVSS    (Y_PTR)(INC_Y*1), X6  \
+	MOVSS    (Y_PTR)(INC_Y*2), X7  \
+	MOVSS    (Y_PTR)(INC3_Y*1), X8 \
+	UNPCKLPS X6, X5                \
+	UNPCKLPS X8, X7                \
+	MOVLHPS  X7, X5
+// KERNEL_LOAD2: low two lanes of X5 = y[0:2] (contiguous y).
+#define KERNEL_LOAD2 \
+	MOVSD (Y_PTR), X5
+// KERNEL_LOAD2_INC: low two lanes of X5 = two y elements INC_Y bytes apart; uses X6 as scratch.
+#define KERNEL_LOAD2_INC \
+	MOVSS    (Y_PTR), X5          \
+	MOVSS    (Y_PTR)(INC_Y*1), X6 \
+	UNPCKLPS X6, X5
+// KERNEL_4x8: products of the four scaled x values (X1-X4) with y[0:8] (X5, X6); results in X5-X12, two registers per row.
+#define KERNEL_4x8 \
+	MOVUPS X5, X7  \
+	MOVUPS X6, X8  \
+	MOVUPS X5, X9  \
+	MOVUPS X6, X10 \
+	MOVUPS X5, X11 \
+	MOVUPS X6, X12 \
+	MULPS  X1, X5  \
+	MULPS  X1, X6  \
+	MULPS  X2, X7  \
+	MULPS  X2, X8  \
+	MULPS  X3, X9  \
+	MULPS  X3, X10 \
+	MULPS  X4, X11 \
+	MULPS  X4, X12
+// STORE_4x8: adds X5-X12 into a 4x8 tile of A and advances A_PTR by 8 elements; ALPHA (X0) is spilled and restored because X0 doubles as scratch.
+#define STORE_4x8 \
+	MOVUPS ALPHA, ALPHA_SPILL         \
+	MOVUPS (A_PTR), X13               \
+	ADDPS  X13, X5                    \
+	MOVUPS 4*SIZE(A_PTR), X14         \
+	ADDPS  X14, X6                    \
+	MOVUPS (A_PTR)(LDA*1), X15        \
+	ADDPS  X15, X7                    \
+	MOVUPS 4*SIZE(A_PTR)(LDA*1), X0   \
+	ADDPS  X0, X8                     \
+	MOVUPS (A_PTR)(LDA*2), X13        \
+	ADDPS  X13, X9                    \
+	MOVUPS 4*SIZE(A_PTR)(LDA*2), X14  \
+	ADDPS  X14, X10                   \
+	MOVUPS (A_PTR)(LDA3*1), X15       \
+	ADDPS  X15, X11                   \
+	MOVUPS 4*SIZE(A_PTR)(LDA3*1), X0  \
+	ADDPS  X0, X12                    \
+	MOVUPS X5, (A_PTR)                \
+	MOVUPS X6, 4*SIZE(A_PTR)          \
+	MOVUPS X7, (A_PTR)(LDA*1)         \
+	MOVUPS X8, 4*SIZE(A_PTR)(LDA*1)   \
+	MOVUPS X9, (A_PTR)(LDA*2)         \
+	MOVUPS X10, 4*SIZE(A_PTR)(LDA*2)  \
+	MOVUPS X11, (A_PTR)(LDA3*1)       \
+	MOVUPS X12, 4*SIZE(A_PTR)(LDA3*1) \
+	MOVUPS ALPHA_SPILL, ALPHA         \
+	ADDQ   $8*SIZE, A_PTR
+// KERNEL_4x4: products of X1-X4 with y[0:4] (X5); one result row each in X5-X8.
+#define KERNEL_4x4 \
+	MOVUPS X5, X6 \
+	MOVUPS X5, X7 \
+	MOVUPS X5, X8 \
+	MULPS  X1, X5 \
+	MULPS  X2, X6 \
+	MULPS  X3, X7 \
+	MULPS  X4, X8
+// STORE_4x4: adds X5-X8 into a 4x4 tile of A and advances A_PTR by 4 elements.
+#define STORE_4x4 \
+	MOVUPS (A_PTR), X13         \
+	ADDPS  X13, X5              \
+	MOVUPS (A_PTR)(LDA*1), X14  \
+	ADDPS  X14, X6              \
+	MOVUPS (A_PTR)(LDA*2), X15  \
+	ADDPS  X15, X7              \
+	MOVUPS (A_PTR)(LDA3*1), X13 \
+	ADDPS  X13, X8              \
+	MOVUPS X5, (A_PTR)          \
+	MOVUPS X6, (A_PTR)(LDA*1)   \
+	MOVUPS X7, (A_PTR)(LDA*2)   \
+	MOVUPS X8, (A_PTR)(LDA3*1)  \
+	ADDQ   $4*SIZE, A_PTR
+// KERNEL_4x2: same instruction sequence as KERNEL_4x4; only the low two lanes are written back by STORE_4x2.
+#define KERNEL_4x2 \
+	MOVUPS X5, X6 \
+	MOVUPS X5, X7 \
+	MOVUPS X5, X8 \
+	MULPS  X1, X5 \
+	MULPS  X2, X6 \
+	MULPS  X3, X7 \
+	MULPS  X4, X8
+// STORE_4x2: adds the low two lanes of X5-X8 into a 4x2 tile of A and advances A_PTR by 2 elements.
+#define STORE_4x2 \
+	MOVSD (A_PTR), X9          \
+	ADDPS X9, X5               \
+	MOVSD (A_PTR)(LDA*1), X10  \
+	ADDPS X10, X6              \
+	MOVSD (A_PTR)(LDA*2), X11  \
+	ADDPS X11, X7              \
+	MOVSD (A_PTR)(LDA3*1), X12 \
+	ADDPS X12, X8              \
+	MOVSD X5, (A_PTR)          \
+	MOVSD X6, (A_PTR)(LDA*1)   \
+	MOVSD X7, (A_PTR)(LDA*2)   \
+	MOVSD X8, (A_PTR)(LDA3*1)  \
+	ADDQ  $2*SIZE, A_PTR
+// KERNEL_4x1: loads the single y element itself, then forms its product with each of X1-X4 in X5-X8.
+#define KERNEL_4x1 \
+	MOVSS (Y_PTR), X5 \
+	MOVSS X5, X6      \
+	MOVSS X5, X7      \
+	MOVSS X5, X8      \
+	MULSS X1, X5      \
+	MULSS X2, X6      \
+	MULSS X3, X7      \
+	MULSS X4, X8
+// STORE_4x1: adds X5-X8 into one column of four rows of A and advances A_PTR by 1 element.
+#define STORE_4x1 \
+	ADDSS (A_PTR), X5         \
+	ADDSS (A_PTR)(LDA*1), X6  \
+	ADDSS (A_PTR)(LDA*2), X7  \
+	ADDSS (A_PTR)(LDA3*1), X8 \
+	MOVSS X5, (A_PTR)         \
+	MOVSS X6, (A_PTR)(LDA*1)  \
+	MOVSS X7, (A_PTR)(LDA*2)  \
+	MOVSS X8, (A_PTR)(LDA3*1) \
+	ADDQ  $SIZE, A_PTR
+// KERNEL_2x8: products of the two scaled x values (X1, X2) with y[0:8] (X5, X6); results in X5-X8, two registers per row.
+#define KERNEL_2x8 \
+	MOVUPS X5, X7 \
+	MOVUPS X6, X8 \
+	MULPS  X1, X5 \
+	MULPS  X1, X6 \
+	MULPS  X2, X7 \
+	MULPS  X2, X8
+// STORE_2x8: adds X5-X8 into a 2x8 tile of A and advances A_PTR by 8 elements.
+#define STORE_2x8 \
+	MOVUPS (A_PTR), X9               \
+	ADDPS  X9, X5                    \
+	MOVUPS 4*SIZE(A_PTR), X10        \
+	ADDPS  X10, X6                   \
+	MOVUPS (A_PTR)(LDA*1), X11       \
+	ADDPS  X11, X7                   \
+	MOVUPS 4*SIZE(A_PTR)(LDA*1), X12 \
+	ADDPS  X12, X8                   \
+	MOVUPS X5, (A_PTR)               \
+	MOVUPS X6, 4*SIZE(A_PTR)         \
+	MOVUPS X7, (A_PTR)(LDA*1)        \
+	MOVUPS X8, 4*SIZE(A_PTR)(LDA*1)  \
+	ADDQ   $8*SIZE, A_PTR
+// KERNEL_2x4: products of X1, X2 with y[0:4] (X5); results in X5, X6.
+#define KERNEL_2x4 \
+	MOVUPS X5, X6 \
+	MULPS  X1, X5 \
+	MULPS  X2, X6
+// STORE_2x4: adds X5, X6 into a 2x4 tile of A and advances A_PTR by 4 elements.
+#define STORE_2x4 \
+	MOVUPS (A_PTR), X9         \
+	ADDPS  X9, X5              \
+	MOVUPS (A_PTR)(LDA*1), X11 \
+	ADDPS  X11, X6             \
+	MOVUPS X5, (A_PTR)         \
+	MOVUPS X6, (A_PTR)(LDA*1)  \
+	ADDQ   $4*SIZE, A_PTR
+// KERNEL_2x2: products of X1, X2 with y[0:2] (X5) in X5, X6; only the low two lanes are written back by STORE_2x2.
+#define KERNEL_2x2 \
+	MOVSD X5, X6 \
+	MULPS X1, X5 \
+	MULPS X2, X6
+// STORE_2x2: adds the low two lanes of X5, X6 into a 2x2 tile of A and advances A_PTR by 2 elements.
+#define STORE_2x2 \
+	MOVSD (A_PTR), X7        \
+	ADDPS X7, X5             \
+	MOVSD (A_PTR)(LDA*1), X8 \
+	ADDPS X8, X6             \
+	MOVSD X5, (A_PTR)        \
+	MOVSD X6, (A_PTR)(LDA*1) \
+	ADDQ  $2*SIZE, A_PTR
+// KERNEL_2x1: loads the single y element itself, then forms its product with X1 and X2 in X5, X6.
+#define KERNEL_2x1 \
+	MOVSS (Y_PTR), X5 \
+	MOVSS X5, X6      \
+	MULSS X1, X5      \
+	MULSS X2, X6
+// STORE_2x1: adds X5, X6 into one column of two rows of A and advances A_PTR by 1 element.
+#define STORE_2x1 \
+	ADDSS (A_PTR), X5        \
+	ADDSS (A_PTR)(LDA*1), X6 \
+	MOVSS X5, (A_PTR)        \
+	MOVSS X6, (A_PTR)(LDA*1) \
+	ADDQ  $SIZE, A_PTR
+// KERNEL_1x8: X5, X6 = X1 * y[0:8], where X1 is the scaled x value and X5, X6 hold y.
+#define KERNEL_1x8 \
+	MULPS X1, X5 \
+	MULPS X1, X6
+// STORE_1x8: adds X5, X6 into eight consecutive elements of one row of A and advances A_PTR by 8 elements.
+#define STORE_1x8 \
+	MOVUPS (A_PTR), X7       \
+	ADDPS  X7, X5            \
+	MOVUPS 4*SIZE(A_PTR), X8 \
+	ADDPS  X8, X6            \
+	MOVUPS X5, (A_PTR)       \
+	MOVUPS X6, 4*SIZE(A_PTR) \
+	ADDQ   $8*SIZE, A_PTR
+// KERNEL_1x4: X5 = X1 * y[0:4]. Only X5 is consumed by STORE_1x4, so X6 is not
+// multiplied here (the former second MULPS operated on a register nothing read afterwards).
+#define KERNEL_1x4 \
+	MULPS X1, X5
+// STORE_1x4: adds X5 into four consecutive elements of one row of A and advances A_PTR by 4 elements.
+#define STORE_1x4 \
+	MOVUPS (A_PTR), X7    \
+	ADDPS  X7, X5         \
+	MOVUPS X5, (A_PTR)    \
+	ADDQ   $4*SIZE, A_PTR
+// KERNEL_1x2: X5 = X1 * y[0:2]; only the low two lanes are written back by STORE_1x2.
+#define KERNEL_1x2 \
+	MULPS X1, X5
+// STORE_1x2: adds the low two lanes of X5 into two consecutive elements of one row of A and advances A_PTR by 2 elements.
+#define STORE_1x2 \
+	MOVSD (A_PTR), X6    \
+	ADDPS X6, X5         \
+	MOVSD X5, (A_PTR)    \
+	ADDQ  $2*SIZE, A_PTR
+// KERNEL_1x1: loads the single y element itself and multiplies it by X1.
+#define KERNEL_1x1 \
+	MOVSS (Y_PTR), X5 \
+	MULSS X1, X5
+// STORE_1x1: adds X5 into one element of A and advances A_PTR by 1 element.
+#define STORE_1x1 \
+	ADDSS (A_PTR), X5  \
+	MOVSS X5, (A_PTR)  \
+	ADDQ  $SIZE, A_PTR
+
+// func Ger(m, n uintptr, alpha float32,
+//	x []float32, incX uintptr,
+//	y []float32, incY uintptr,
+//	a []float32, lda uintptr)
+TEXT ·Ger(SB), 0, $16-120 // A += alpha * x * yᵀ; the 16-byte frame is the ALPHA_SPILL slot used by STORE_4x8
+	MOVQ M_DIM, M
+	MOVQ N_DIM, N
+	CMPQ M, $0
+	JE   end    // if m == 0 { return }
+	CMPQ N, $0
+	JE   end    // if n == 0 { return }
+
+	LOAD_ALPHA
+
+	MOVQ x_base+24(FP), X_PTR
+	MOVQ y_base+56(FP), Y_PTR
+	MOVQ a_base+88(FP), A_ROW
+	MOVQ A_ROW, A_PTR
+	MOVQ lda+112(FP), LDA     // LDA = LDA * sizeof(float32)
+	SHLQ $BITSIZE, LDA
+	LEAQ (LDA)(LDA*2), LDA3   // LDA3 = LDA * 3
+
+	CMPQ incY+80(FP), $1 // Check for dense vector Y (fast-path)
+	JNE  inc
+	CMPQ incX+48(FP), $1 // Check for dense vector X (fast-path)
+	JNE  inc
+
+	SHRQ $2, M // M = floor( m / 4 ) row groups
+	JZ   r2
+
+r4:
+
+	// LOAD 4
+	LOAD_SCALED4
+
+	MOVQ N_DIM, N
+	SHRQ $KERNELSIZE, N
+	JZ   r4c4
+
+r4c8:
+	// 4x8 KERNEL
+	KERNEL_LOAD8
+	KERNEL_4x8
+	STORE_4x8
+
+	ADDQ $8*SIZE, Y_PTR
+
+	DECQ N
+	JNZ  r4c8
+
+r4c4:
+	TESTQ $4, N_DIM
+	JZ    r4c2
+
+	// 4x4 KERNEL
+	KERNEL_LOAD4
+	KERNEL_4x4
+	STORE_4x4
+
+	ADDQ $4*SIZE, Y_PTR
+
+r4c2:
+	TESTQ $2, N_DIM
+	JZ    r4c1
+
+	// 4x2 KERNEL
+	KERNEL_LOAD2
+	KERNEL_4x2
+	STORE_4x2
+
+	ADDQ $2*SIZE, Y_PTR
+
+r4c1:
+	TESTQ $1, N_DIM
+	JZ    r4end
+
+	// 4x1 KERNEL
+	KERNEL_4x1
+	STORE_4x1
+
+	ADDQ $SIZE, Y_PTR
+
+r4end:
+	ADDQ $4*SIZE, X_PTR
+	MOVQ Y, Y_PTR              // rewind y to its first element for the next row group
+	LEAQ (A_ROW)(LDA*4), A_ROW // advance A by four rows
+	MOVQ A_ROW, A_PTR
+
+	DECQ M
+	JNZ  r4
+
+r2:
+	TESTQ $2, M_DIM
+	JZ    r1
+
+	// LOAD 2
+	LOAD_SCALED2
+
+	MOVQ N_DIM, N
+	SHRQ $KERNELSIZE, N
+	JZ   r2c4
+
+r2c8:
+	// 2x8 KERNEL
+	KERNEL_LOAD8
+	KERNEL_2x8
+	STORE_2x8
+
+	ADDQ $8*SIZE, Y_PTR
+
+	DECQ N
+	JNZ  r2c8
+
+r2c4:
+	TESTQ $4, N_DIM
+	JZ    r2c2
+
+	// 2x4 KERNEL
+	KERNEL_LOAD4
+	KERNEL_2x4
+	STORE_2x4
+
+	ADDQ $4*SIZE, Y_PTR
+
+r2c2:
+	TESTQ $2, N_DIM
+	JZ    r2c1
+
+	// 2x2 KERNEL
+	KERNEL_LOAD2
+	KERNEL_2x2
+	STORE_2x2
+
+	ADDQ $2*SIZE, Y_PTR
+
+r2c1:
+	TESTQ $1, N_DIM
+	JZ    r2end
+
+	// 2x1 KERNEL
+	KERNEL_2x1
+	STORE_2x1
+
+	ADDQ $SIZE, Y_PTR
+
+r2end:
+	ADDQ $2*SIZE, X_PTR
+	MOVQ Y, Y_PTR              // rewind y to its first element for the last row
+	LEAQ (A_ROW)(LDA*2), A_ROW // advance A by two rows
+	MOVQ A_ROW, A_PTR
+
+r1:
+	TESTQ $1, M_DIM
+	JZ    end
+
+	// LOAD 1
+	LOAD_SCALED1
+
+	MOVQ N_DIM, N
+	SHRQ $KERNELSIZE, N
+	JZ   r1c4
+
+r1c8:
+	// 1x8 KERNEL
+	KERNEL_LOAD8
+	KERNEL_1x8
+	STORE_1x8
+
+	ADDQ $8*SIZE, Y_PTR
+
+	DECQ N
+	JNZ  r1c8
+
+r1c4:
+	TESTQ $4, N_DIM
+	JZ    r1c2
+
+	// 1x4 KERNEL
+	KERNEL_LOAD4
+	KERNEL_1x4
+	STORE_1x4
+
+	ADDQ $4*SIZE, Y_PTR
+
+r1c2:
+	TESTQ $2, N_DIM
+	JZ    r1c1
+
+	// 1x2 KERNEL
+	KERNEL_LOAD2
+	KERNEL_1x2
+	STORE_1x2
+
+	ADDQ $2*SIZE, Y_PTR
+
+r1c1:
+	TESTQ $1, N_DIM
+	JZ    end
+
+	// 1x1 KERNEL
+	KERNEL_1x1
+	STORE_1x1
+
+end:
+	RET
+
+inc:  // Algorithm for incX != 1 or incY != 1 ( split loads in kernel )
+	MOVQ incX+48(FP), INC_X       // INC_X = incX * sizeof(float32)
+	SHLQ $BITSIZE, INC_X
+	MOVQ incY+80(FP), INC_Y       // INC_Y = incY * sizeof(float32)
+	SHLQ $BITSIZE, INC_Y
+	LEAQ (INC_X)(INC_X*2), INC3_X // INC3_X = INC_X * 3
+	LEAQ (INC_Y)(INC_Y*2), INC3_Y // INC3_Y = INC_Y * 3
+
+	XORQ    TMP2, TMP2  // TMP2 = 0
+	MOVQ    M, TMP1
+	SUBQ    $1, TMP1
+	IMULQ   INC_X, TMP1
+	NEGQ    TMP1        // TMP1 = -( m - 1 ) * INC_X, a byte offset
+	CMPQ    INC_X, $0
+	CMOVQLT TMP1, TMP2  // if incX < 0 { TMP2 = TMP1 }
+	ADDQ    TMP2, X_PTR // X_PTR = first x element visited; TMP2 is already in bytes, so it is not scaled by SIZE
+
+	XORQ    TMP2, TMP2  // TMP2 = 0
+	MOVQ    N, TMP1
+	SUBQ    $1, TMP1
+	IMULQ   INC_Y, TMP1
+	NEGQ    TMP1        // TMP1 = -( n - 1 ) * INC_Y, a byte offset
+	CMPQ    INC_Y, $0
+	CMOVQLT TMP1, TMP2  // if incY < 0 { TMP2 = TMP1 }
+	ADDQ    TMP2, Y_PTR // Y_PTR = first y element visited
+	MOVQ    Y_PTR, TMP1 // TMP1 = y restart address, reloaded after every row group
+
+	SHRQ $2, M // M = floor( m / 4 ) row groups
+	JZ   inc_r2
+
+inc_r4:
+	// LOAD 4
+	LOAD_SCALED4_INC
+
+	MOVQ N_DIM, N
+	SHRQ $KERNELSIZE, N
+	JZ   inc_r4c4
+
+inc_r4c8:
+	// 4x8 KERNEL
+	KERNEL_LOAD8_INC
+	KERNEL_4x8
+	STORE_4x8
+
+	LEAQ (Y_PTR)(INC_Y*4), Y_PTR // KERNEL_LOAD8_INC already advanced Y_PTR by the first four elements
+	DECQ N
+	JNZ  inc_r4c8
+
+inc_r4c4:
+	TESTQ $4, N_DIM
+	JZ    inc_r4c2
+
+	// 4x4 KERNEL
+	KERNEL_LOAD4_INC
+	KERNEL_4x4
+	STORE_4x4
+
+	LEAQ (Y_PTR)(INC_Y*4), Y_PTR
+
+inc_r4c2:
+	TESTQ $2, N_DIM
+	JZ    inc_r4c1
+
+	// 4x2 KERNEL
+	KERNEL_LOAD2_INC
+	KERNEL_4x2
+	STORE_4x2
+
+	LEAQ (Y_PTR)(INC_Y*2), Y_PTR
+
+inc_r4c1:
+	TESTQ $1, N_DIM
+	JZ    inc_r4end
+
+	// 4x1 KERNEL
+	KERNEL_4x1
+	STORE_4x1
+
+	ADDQ INC_Y, Y_PTR
+
+inc_r4end:
+	LEAQ (X_PTR)(INC_X*4), X_PTR
+	MOVQ TMP1, Y_PTR             // rewind y to its first visited element (not y_base, which is wrong for incY < 0)
+	LEAQ (A_ROW)(LDA*4), A_ROW   // advance A by four rows
+	MOVQ A_ROW, A_PTR
+
+	DECQ M
+	JNZ  inc_r4
+
+inc_r2:
+	TESTQ $2, M_DIM
+	JZ    inc_r1
+
+	// LOAD 2
+	LOAD_SCALED2_INC
+
+	MOVQ N_DIM, N
+	SHRQ $KERNELSIZE, N
+	JZ   inc_r2c4
+
+inc_r2c8:
+	// 2x8 KERNEL
+	KERNEL_LOAD8_INC
+	KERNEL_2x8
+	STORE_2x8
+
+	LEAQ (Y_PTR)(INC_Y*4), Y_PTR
+	DECQ N
+	JNZ  inc_r2c8
+
+inc_r2c4:
+	TESTQ $4, N_DIM
+	JZ    inc_r2c2
+
+	// 2x4 KERNEL
+	KERNEL_LOAD4_INC
+	KERNEL_2x4
+	STORE_2x4
+
+	LEAQ (Y_PTR)(INC_Y*4), Y_PTR
+
+inc_r2c2:
+	TESTQ $2, N_DIM
+	JZ    inc_r2c1
+
+	// 2x2 KERNEL
+	KERNEL_LOAD2_INC
+	KERNEL_2x2
+	STORE_2x2
+
+	LEAQ (Y_PTR)(INC_Y*2), Y_PTR
+
+inc_r2c1:
+	TESTQ $1, N_DIM
+	JZ    inc_r2end
+
+	// 2x1 KERNEL
+	KERNEL_2x1
+	STORE_2x1
+
+	ADDQ INC_Y, Y_PTR
+
+inc_r2end:
+	LEAQ (X_PTR)(INC_X*2), X_PTR
+	MOVQ TMP1, Y_PTR             // rewind y to its first visited element
+	LEAQ (A_ROW)(LDA*2), A_ROW   // advance A by two rows
+	MOVQ A_ROW, A_PTR
+
+inc_r1:
+	TESTQ $1, M_DIM
+	JZ    end
+
+	// LOAD 1
+	LOAD_SCALED1
+
+	MOVQ N_DIM, N
+	SHRQ $KERNELSIZE, N
+	JZ   inc_r1c4
+
+inc_r1c8:
+	// 1x8 KERNEL
+	KERNEL_LOAD8_INC
+	KERNEL_1x8
+	STORE_1x8
+
+	LEAQ (Y_PTR)(INC_Y*4), Y_PTR
+	DECQ N
+	JNZ  inc_r1c8
+
+inc_r1c4:
+	TESTQ $4, N_DIM
+	JZ    inc_r1c2
+
+	// 1x4 KERNEL
+	KERNEL_LOAD4_INC
+	KERNEL_1x4
+	STORE_1x4
+
+	LEAQ (Y_PTR)(INC_Y*4), Y_PTR
+
+inc_r1c2:
+	TESTQ $2, N_DIM
+	JZ    inc_r1c1
+
+	// 1x2 KERNEL
+	KERNEL_LOAD2_INC
+	KERNEL_1x2
+	STORE_1x2
+
+	LEAQ (Y_PTR)(INC_Y*2), Y_PTR
+
+inc_r1c1:
+	TESTQ $1, N_DIM
+	JZ    inc_end
+
+	// 1x1 KERNEL
+	KERNEL_1x1
+	STORE_1x1
+
+inc_end:
+	RET
--- a/vendor/gonum.org/v1/gonum/internal/asm/f32/ge_noasm.go
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/ge_noasm.go
@@ -0,0 +1,39 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !amd64 || noasm || gccgo || safe
+// +build !amd64 noasm gccgo safe
+
+package f32
+
+// Ger applies the rank-one update
+//
+//	A += alpha * x * yᵀ
+//
+// to the m×n matrix stored row by row in a (row stride lda); x and y are read with strides incX and incY.
+func Ger(m, n uintptr, alpha float32, x []float32, incX uintptr, y []float32, incY uintptr, a []float32, lda uintptr) {
+	unitStride := incX == 1 && incY == 1
+	if !unitStride {
+		// A negative stride walks its vector backwards, so the walk starts at the far end.
+		var xStart, yStart uintptr
+		if int(incX) < 0 {
+			xStart = uintptr(-int(m-1) * int(incX))
+		}
+		if int(incY) < 0 {
+			yStart = uintptr(-int(n-1) * int(incY))
+		}
+		for row, xi := 0, xStart; row < int(m); row, xi = row+1, xi+incX {
+			rowOff := uintptr(row) * lda
+			AxpyInc(alpha*x[xi], y, a[rowOff:rowOff+n], n, incY, 1, yStart, 0)
+		}
+		return
+	}
+
+	// Unit strides: row i of a receives alpha*x[i] times the first n elements of y.
+	xs, ys := x[:m], y[:n]
+	for row, xv := range xs {
+		rowOff := uintptr(row) * lda
+		AxpyUnitary(alpha*xv, ys, a[rowOff:rowOff+n])
+	}
+}
--- a/vendor/gonum.org/v1/gonum/internal/asm/f32/gemv.go
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/gemv.go
@@ -0,0 +1,92 @@
+// Copyright ©2017 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package f32
+
+// GemvN computes the matrix-vector product
+//
+//	y = alpha * A * x + beta * y
+//
+// for the m×n row-major matrix a (row stride lda), vectors x, y and scalars alpha, beta.
+func GemvN(m, n uintptr, alpha float32, a []float32, lda uintptr, x []float32, incX uintptr, beta float32, y []float32, incY uintptr) {
+	// Unit strides: y[i] is a contiguous dot product of row i with x.
+	if incX == 1 && incY == 1 {
+		for i, row := uintptr(0), uintptr(0); i < m; i, row = i+1, row+lda {
+			if beta == 0 {
+				// y is overwritten without being read.
+				y[i] = alpha * DotUnitary(a[row:row+n], x)
+			} else {
+				y[i] = y[i]*beta + alpha*DotUnitary(a[row:row+n], x)
+			}
+		}
+		return
+	}
+
+	// A negative stride walks its vector from the far end.
+	var kx, iy uintptr
+	if int(incX) < 0 {
+		kx = uintptr(-int(n-1) * int(incX))
+	}
+	if int(incY) < 0 {
+		iy = uintptr(-int(m-1) * int(incY))
+	}
+
+	// General strides: one strided dot product per row.
+	for i, row := uintptr(0), uintptr(0); i < m; i, row = i+1, row+lda {
+		if beta == 0 {
+			// y is overwritten without being read.
+			y[iy] = alpha * DotInc(x, a[row:row+n], n, incX, 1, kx, 0)
+		} else {
+			y[iy] = y[iy]*beta + alpha*DotInc(x, a[row:row+n], n, incX, 1, kx, 0)
+		}
+		iy += incY
+	}
+}
+
+// GemvT computes
+//
+//	y = alpha * Aᵀ * x + beta * y
+//
+// where A is an m×n dense matrix (row stride lda), x has m and y has n elements; only those n elements of y are written.
+func GemvT(m, n uintptr, alpha float32, a []float32, lda uintptr, x []float32, incX uintptr, beta float32, y []float32, incY uintptr) {
+	var kx, ky, i uintptr
+	if int(incX) < 0 {
+		kx = uintptr(-int(m-1) * int(incX)) // negative stride: start at the far end of x
+	}
+	if int(incY) < 0 {
+		ky = uintptr(-int(n-1) * int(incY)) // negative stride: start at the far end of y
+	}
+	switch { // step 1: y = beta * y over the n result elements
+	case beta == 0: // beta == 0 is special-cased to memclear
+		if incY == 1 {
+			for i := range y[:n] { // clear only the n result elements, like ScalUnitary(beta, y[:n]) below
+				y[i] = 0
+			}
+		} else {
+			iy := ky
+			for i := 0; i < int(n); i++ {
+				y[iy] = 0
+				iy += incY
+			}
+		}
+	case int(incY) < 0:
+		ScalInc(beta, y, n, uintptr(int(-incY))) // scale with the positive stride |incY| starting at index 0
+	case incY == 1:
+		ScalUnitary(beta, y[:n])
+	default:
+		ScalInc(beta, y, n, incY)
+	}
+
+	if incX == 1 && incY == 1 { // step 2: y += alpha * x[i] * (row i of A), for every row
+		for i = 0; i < m; i++ {
+			AxpyUnitaryTo(y, alpha*x[i], a[lda*i:lda*i+n], y)
+		}
+		return
+	}
+	ix := kx
+	for i = 0; i < m; i++ {
+		AxpyInc(alpha*x[ix], a[lda*i:lda*i+n], y, n, 1, incY, 0, ky)
+		ix += incX
+	}
+}
--- a/vendor/gonum.org/v1/gonum/internal/asm/f32/l2norm.go
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/l2norm.go
@@ -0,0 +1,90 @@
+// Copyright ©2019 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package f32
+
+import "gonum.org/v1/gonum/internal/math32"
+
+// L2NormUnitary returns the Euclidean (L2) norm of x.
+func L2NormUnitary(x []float32) (sum float32) {
+	var scale float32        // largest |x[i]| seen so far
+	sumSquares := float32(1) // sum of squares, kept relative to scale
+	for i := 0; i < len(x); i++ {
+		if x[i] != 0 {
+			absxi := math32.Abs(x[i])
+			if math32.IsNaN(absxi) {
+				return math32.NaN()
+			}
+			// Rescale the running sum whenever a new maximum appears.
+			if absxi > scale {
+				s := scale / absxi
+				sumSquares = 1 + sumSquares*s*s
+				scale = absxi
+			} else {
+				s := absxi / scale
+				sumSquares += s * s
+			}
+		}
+	}
+	if math32.IsInf(scale, 1) {
+		return math32.Inf(1)
+	}
+	return scale * math32.Sqrt(sumSquares)
+}
+
+// L2NormInc returns the Euclidean (L2) norm of the n elements of x spaced incX apart.
+func L2NormInc(x []float32, n, incX uintptr) (sum float32) {
+	var scale float32
+	sumSquares := float32(1)
+	for ix, end := uintptr(0), n*incX; ix < end; ix += incX {
+		if x[ix] == 0 {
+			continue
+		}
+		absxi := math32.Abs(x[ix])
+		switch {
+		case math32.IsNaN(absxi):
+			return math32.NaN()
+		case scale < absxi:
+			// New maximum: rescale the running sum to it.
+			ratio := scale / absxi
+			sumSquares = 1 + sumSquares*ratio*ratio
+			scale = absxi
+		default:
+			ratio := absxi / scale
+			sumSquares += ratio * ratio
+		}
+	}
+	if math32.IsInf(scale, 1) {
+		return math32.Inf(1)
+	}
+	return scale * math32.Sqrt(sumSquares)
+}
+
+// L2DistanceUnitary returns the Euclidean (L2) norm of the difference x-y.
+func L2DistanceUnitary(x, y []float32) (sum float32) {
+	var scale float32
+	sumSquares := float32(1)
+	for i := range x {
+		diff := x[i] - y[i]
+		if diff == 0 {
+			continue
+		}
+		absxi := math32.Abs(diff)
+		if math32.IsNaN(absxi) {
+			return math32.NaN()
+		}
+		if scale >= absxi {
+			r := absxi / scale
+			sumSquares += r * r
+			continue
+		}
+		r := scale / absxi
+		sumSquares = 1 + sumSquares*r*r
+		scale = absxi
+	}
+	if math32.IsInf(scale, 1) {
+		return math32.Inf(1)
+	}
+	return scale * math32.Sqrt(sumSquares)
+}
--- a/vendor/gonum.org/v1/gonum/internal/asm/f32/scal.go
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/scal.go
@@ -0,0 +1,59 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package f32
+
+// ScalUnitary multiplies every element of x by alpha in place:
+//
+//	for i := range x {
+//		x[i] *= alpha
+//	}
+func ScalUnitary(alpha float32, x []float32) {
+	for i := 0; i < len(x); i++ {
+		x[i] = x[i] * alpha
+	}
+}
+
+// ScalUnitaryTo stores alpha times each element of x into dst:
+//
+//	for i, v := range x {
+//		dst[i] = alpha * v
+//	}
+func ScalUnitaryTo(dst []float32, alpha float32, x []float32) {
+	for i := 0; i < len(x); i++ {
+		dst[i] = alpha * x[i]
+	}
+}
+
+// ScalInc is
+//
+//	var ix uintptr
+//	for i := 0; i < int(n); i++ {
+//		x[ix] *= alpha
+//		ix += incX
+//	}
+func ScalInc(alpha float32, x []float32, n, incX uintptr) {
+	var ix uintptr
+	for i := 0; i < int(n); i++ {
+		x[ix] *= alpha
+		ix += incX
+	}
+}
+
+// ScalIncTo is
+//
+//	var idst, ix uintptr
+//	for i := 0; i < int(n); i++ {
+//		dst[idst] = alpha * x[ix]
+//		ix += incX
+//		idst += incDst
+//	}
+func ScalIncTo(dst []float32, incDst uintptr, alpha float32, x []float32, n, incX uintptr) {
+	var idst, ix uintptr
+	for i := 0; i < int(n); i++ {
+		dst[idst] = alpha * x[ix]
+		ix += incX
+		idst += incDst
+	}
+}
--- a/vendor/gonum.org/v1/gonum/internal/asm/f32/stubs_amd64.go
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/stubs_amd64.go
@@ -0,0 +1,86 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !noasm && !gccgo && !safe
+// +build !noasm,!gccgo,!safe
+
+package f32
+
+// AxpyUnitary is
+//
+//	for i, v := range x {
+//		y[i] += alpha * v
+//	}
+func AxpyUnitary(alpha float32, x, y []float32)
+
+// AxpyUnitaryTo is
+//
+//	for i, v := range x {
+//		dst[i] = alpha*v + y[i]
+//	}
+func AxpyUnitaryTo(dst []float32, alpha float32, x, y []float32)
+
+// AxpyInc is
+//
+//	for i := 0; i < int(n); i++ {
+//		y[iy] += alpha * x[ix]
+//		ix += incX
+//		iy += incY
+//	}
+func AxpyInc(alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr)
+
+// AxpyIncTo is
+//
+//	for i := 0; i < int(n); i++ {
+//		dst[idst] = alpha*x[ix] + y[iy]
+//		ix += incX
+//		iy += incY
+//		idst += incDst
+//	}
+func AxpyIncTo(dst []float32, incDst, idst uintptr, alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr)
+
+// DdotUnitary is
+//
+//	for i, v := range x {
+//		sum += float64(y[i]) * float64(v)
+//	}
+//	return
+func DdotUnitary(x, y []float32) (sum float64)
+
+// DdotInc is
+//
+//	for i := 0; i < int(n); i++ {
+//		sum += float64(y[iy]) * float64(x[ix])
+//		ix += incX
+//		iy += incY
+//	}
+//	return
+func DdotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float64)
+
+// DotUnitary is
+//
+//	for i, v := range x {
+//		sum += y[i] * v
+//	}
+//	return sum
+func DotUnitary(x, y []float32) (sum float32)
+
+// DotInc is
+//
+//	for i := 0; i < int(n); i++ {
+//		sum += y[iy] * x[ix]
+//		ix += incX
+//		iy += incY
+//	}
+//	return sum
+func DotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float32)
+
+// Sum is
+//
+//	var sum float32
+//	for _, v := range x {
+//		sum += v
+//	}
+//	return sum
+func Sum(x []float32) float32
--- a/vendor/gonum.org/v1/gonum/internal/asm/f32/stubs_noasm.go
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/stubs_noasm.go
@@ -0,0 +1,137 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !amd64 || noasm || gccgo || safe
+// +build !amd64 noasm gccgo safe
+
+package f32
+
+// AxpyUnitary adds alpha times each element of x to the matching element of y:
+//
+//	for i, v := range x {
+//		y[i] += alpha * v
+//	}
+func AxpyUnitary(alpha float32, x, y []float32) {
+	for i := 0; i < len(x); i++ {
+		y[i] += alpha * x[i]
+	}
+}
+
+// AxpyUnitaryTo stores alpha*x[i] + y[i] into dst[i] for every element of x:
+//
+//	for i, v := range x {
+//		dst[i] = alpha*v + y[i]
+//	}
+func AxpyUnitaryTo(dst []float32, alpha float32, x, y []float32) {
+	for i := 0; i < len(x); i++ {
+		dst[i] = alpha*x[i] + y[i]
+	}
+}
+
+// AxpyInc is the strided axpy over n elements, starting at x[ix] and y[iy]:
+//
+//	for i := 0; i < int(n); i++ {
+//		y[iy] += alpha * x[ix]
+//		ix += incX
+//		iy += incY
+//	}
+func AxpyInc(alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr) {
+	for left := int(n); left > 0; left-- {
+		y[iy] += alpha * x[ix]
+		ix += incX
+		iy += incY
+	}
+}
+
+// AxpyIncTo is the strided axpy over n elements that writes its result to dst:
+//
+//	for i := 0; i < int(n); i++ {
+//		dst[idst] = alpha*x[ix] + y[iy]
+//		ix += incX
+//		iy += incY
+//		idst += incDst
+//	}
+func AxpyIncTo(dst []float32, incDst, idst uintptr, alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr) {
+	for left := int(n); left > 0; left-- {
+		dst[idst] = alpha*x[ix] + y[iy]
+		ix += incX
+		iy += incY
+		idst += incDst
+	}
+}
+
+// DotUnitary returns the float32 dot product of x with the leading elements of y:
+//
+//	for i, v := range x {
+//		sum += y[i] * v
+//	}
+//	return sum
+func DotUnitary(x, y []float32) (sum float32) {
+	for i := 0; i < len(x); i++ {
+		sum += y[i] * x[i]
+	}
+	return sum
+}
+
+// DotInc returns the float32 dot product of n strided elements of x and y:
+//
+//	for i := 0; i < int(n); i++ {
+//		sum += y[iy] * x[ix]
+//		ix += incX
+//		iy += incY
+//	}
+//	return sum
+func DotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float32) {
+	for left := int(n); left > 0; left-- {
+		sum += y[iy] * x[ix]
+		ix += incX
+		iy += incY
+	}
+	return sum
+}
+
+// DdotUnitary returns the dot product of x and y accumulated in float64:
+//
+//	for i, v := range x {
+//		sum += float64(y[i]) * float64(v)
+//	}
+//	return
+func DdotUnitary(x, y []float32) (sum float64) {
+	for i := 0; i < len(x); i++ {
+		sum += float64(y[i]) * float64(x[i])
+	}
+	return sum
+}
+
+// DdotInc returns the strided dot product of x and y accumulated in float64:
+//
+//	for i := 0; i < int(n); i++ {
+//		sum += float64(y[iy]) * float64(x[ix])
+//		ix += incX
+//		iy += incY
+//	}
+//	return
+func DdotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float64) {
+	for left := int(n); left > 0; left-- {
+		sum += float64(y[iy]) * float64(x[ix])
+		ix += incX
+		iy += incY
+	}
+	return sum
+}
+
+// Sum is
+//
+//	var sum float32
+//	for _, v := range x {
+//		sum += v
+//	}
+//	return sum
+func Sum(x []float32) float32 {
+	var sum float32
+	for _, v := range x {
+		sum += v
+	}
+	return sum
+}
--- a/vendor/gonum.org/v1/gonum/internal/asm/f32/sum_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f32/sum_amd64.s
@@ -0,0 +1,100 @@
+// Copyright ©2021 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define X_PTR SI
+#define IDX AX
+#define LEN CX
+#define TAIL BX
+#define SUM X0
+#define SUM_1 X1
+#define SUM_2 X2
+#define SUM_3 X3
+
+// func Sum(x []float32) float32
+TEXT ·Sum(SB), NOSPLIT, $0
+	MOVQ x_base+0(FP), X_PTR // X_PTR = &x
+	MOVQ x_len+8(FP), LEN    // LEN = len(x)
+	XORQ IDX, IDX            // i = 0
+	PXOR SUM, SUM            // p_sum_i = 0
+	CMPQ LEN, $0             // if LEN == 0 { return 0 }
+	JE   sum_end
+
+	PXOR SUM_1, SUM_1 // clear the three extra accumulators used by the 16x loop
+	PXOR SUM_2, SUM_2
+	PXOR SUM_3, SUM_3
+
+	MOVQ X_PTR, TAIL // Check memory alignment
+	ANDQ $15, TAIL   // TAIL = &x % 16
+	JZ   no_trim     // if TAIL == 0 { goto no_trim }
+	SUBQ $16, TAIL   // TAIL -= 16  ( negative byte distance to the next 16-byte boundary )
+
+sum_align: // Align on 16-byte boundary do {
+	ADDSS (X_PTR)(IDX*4), SUM // SUM[0] += x[i]
+	INCQ  IDX                 // i++
+	DECQ  LEN                 // LEN--
+	JZ    sum_end             // if LEN == 0 { return }
+	ADDQ  $4, TAIL            // TAIL += 4
+	JNZ   sum_align           // } while TAIL != 0
+
+no_trim:
+	MOVQ LEN, TAIL // TAIL = remaining element count ( its low bits select the tail steps below )
+	SHRQ $4, LEN   // LEN = floor( n / 16 )
+	JZ   sum_tail8 // if LEN == 0 { goto sum_tail8 }
+
+
+sum_loop: // sum 16x wide do {
+	ADDPS (X_PTR)(IDX*4), SUM     // sum_i += x[i:i+4]
+	ADDPS 16(X_PTR)(IDX*4), SUM_1
+	ADDPS 32(X_PTR)(IDX*4), SUM_2
+	ADDPS 48(X_PTR)(IDX*4), SUM_3
+
+	ADDQ  $16, IDX                // i += 16
+	DECQ  LEN
+	JNZ   sum_loop                // } while --LEN > 0
+
+sum_tail8:
+	ADDPS SUM_3, SUM   // fold the four accumulators into two
+	ADDPS SUM_2, SUM_1
+
+	TESTQ $8, TAIL // if TAIL & 8 == 0 { goto sum_tail4 }
+	JZ    sum_tail4
+
+	ADDPS (X_PTR)(IDX*4), SUM // sum_i += x[i:i+4]
+	ADDPS 16(X_PTR)(IDX*4), SUM_1
+	ADDQ  $8, IDX
+
+sum_tail4:
+	ADDPS SUM_1, SUM // fold the two accumulators into one
+
+	TESTQ $4, TAIL // if TAIL & 4 == 0 { goto sum_tail2 }
+	JZ    sum_tail2
+
+	ADDPS (X_PTR)(IDX*4), SUM // sum_i += x[i:i+4]
+	ADDQ  $4, IDX
+
+sum_tail2:
+	HADDPS SUM, SUM            // sum_i[:2] = pairwise sums of the four lanes
+
+	TESTQ $2, TAIL // if TAIL & 2 == 0 { goto sum_tail1 }
+	JZ    sum_tail1
+
+	MOVSD (X_PTR)(IDX*4), SUM_1 // reuse SUM_1 to load two float32 values with one 64-bit move
+	ADDPS SUM_1, SUM            // sum_i += x[i:i+2]
+	ADDQ  $2, IDX
+
+sum_tail1:
+	HADDPS SUM, SUM // sum_i[0] += sum_i[1]
+
+	TESTQ $1, TAIL // if TAIL & 1 == 0 { goto sum_end }
+	JZ    sum_end
+
+	ADDSS (X_PTR)(IDX*4), SUM // sum_i[0] += x[i]
+
+sum_end: // return sum
+	MOVSS SUM, ret+24(FP)
+	RET
--- a/vendor/gonum.org/v1/gonum/internal/asm/f64/abssum_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/abssum_amd64.s
@@ -0,0 +1,82 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+// func L1Norm(x []float64) float64
+TEXT ·L1Norm(SB), NOSPLIT, $0
+	MOVQ x_base+0(FP), SI // SI = &x
+	MOVQ x_len+8(FP), CX  // CX = len(x)
+	XORQ AX, AX           // i = 0
+	PXOR X0, X0           // p_sum_i = 0
+	PXOR X1, X1
+	PXOR X2, X2
+	PXOR X3, X3
+	PXOR X4, X4
+	PXOR X5, X5
+	PXOR X6, X6
+	PXOR X7, X7
+	CMPQ CX, $0           // if CX == 0 { return 0 }
+	JE   absum_end
+	MOVQ CX, BX
+	ANDQ $7, BX           // BX = len(x) % 8
+	SHRQ $3, CX           // CX = floor( len(x) / 8 )
+	JZ   absum_tail_start // if CX == 0 { goto absum_tail_start }
+
+absum_loop: // do {
+	// p_sum = max( p_sum + x[i], p_sum - x[i] ), which adds |x[i]| without a sign mask
+	MOVUPS (SI)(AX*8), X8    // X_i = x[i:i+2]
+	MOVUPS 16(SI)(AX*8), X9
+	MOVUPS 32(SI)(AX*8), X10
+	MOVUPS 48(SI)(AX*8), X11
+	ADDPD  X8, X0            // p_sum_i += X_i  ( positive values )
+	ADDPD  X9, X2
+	ADDPD  X10, X4
+	ADDPD  X11, X6
+	SUBPD  X8, X1            // p_sum_(i+1) -= X_i  ( negative values )
+	SUBPD  X9, X3
+	SUBPD  X10, X5
+	SUBPD  X11, X7
+	MAXPD  X1, X0            // p_sum_i = max( p_sum_i, p_sum_(i+1) )
+	MAXPD  X3, X2
+	MAXPD  X5, X4
+	MAXPD  X7, X6
+	MOVAPS X0, X1            // p_sum_(i+1) = p_sum_i
+	MOVAPS X2, X3
+	MOVAPS X4, X5
+	MOVAPS X6, X7
+	ADDQ   $8, AX            // i += 8
+	LOOP   absum_loop        // } while --CX > 0
+
+	// p_sum_0 = p_sum_0 + p_sum_2 + p_sum_4 + p_sum_6  ( X3, X5, X7 hold copies of X2, X4, X6 )
+	ADDPD X3, X0
+	ADDPD X5, X7
+	ADDPD X7, X0
+
+	// p_sum_0[0] = p_sum_0[0] + p_sum_0[1]
+	MOVAPS X0, X1
+	SHUFPD $0x3, X0, X0 // lower( p_sum_0 ) = upper( p_sum_0 )
+	ADDSD  X1, X0
+	CMPQ   BX, $0
+	JE     absum_end    // if BX == 0 { goto absum_end }
+
+absum_tail_start: // Reset loop registers
+	MOVQ  BX, CX // Loop counter:  CX = BX
+	XORPS X8, X8 // X_8 = 0
+
+absum_tail: // do {
+	// p_sum = max( p_sum + x[i], p_sum - x[i] ), which adds |x[i]|
+	MOVSD (SI)(AX*8), X8 // X_8 = x[i]
+	MOVSD X0, X1         // p_sum_1 = p_sum_0
+	ADDSD X8, X0         // p_sum_0 += X_8
+	SUBSD X8, X1         // p_sum_1 -= X_8
+	MAXSD X1, X0         // p_sum_0 = max( p_sum_0, p_sum_1 )
+	INCQ  AX             // i++
+	LOOP  absum_tail     // } while --CX > 0
+
+absum_end: // return p_sum_0
+	MOVSD X0, sum+24(FP)
+	RET
--- a/vendor/gonum.org/v1/gonum/internal/asm/f64/abssuminc_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/abssuminc_amd64.s
@@ -0,0 +1,90 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+// func L1NormInc(x []float64, n, incX int) (sum float64)
+TEXT ·L1NormInc(SB), NOSPLIT, $0
+	MOVQ  x_base+0(FP), SI // SI = &x
+	MOVQ  n+24(FP), CX     // CX = n
+	MOVQ  incX+32(FP), AX  // AX =  increment * sizeof( float64 )
+	SHLQ  $3, AX
+	MOVQ  AX, DX           // DX = AX * 3
+	IMULQ $3, DX
+	PXOR  X0, X0           // p_sum_i = 0
+	PXOR  X1, X1
+	PXOR  X2, X2
+	PXOR  X3, X3
+	PXOR  X4, X4
+	PXOR  X5, X5
+	PXOR  X6, X6
+	PXOR  X7, X7
+	CMPQ  CX, $0           // if CX == 0 { return 0 }
+	JE    absum_end
+	MOVQ  CX, BX
+	ANDQ  $7, BX           // BX = n % 8
+	SHRQ  $3, CX           // CX = floor( n / 8 )
+	JZ    absum_tail_start // if CX == 0 { goto absum_tail_start }
+
+absum_loop: // do {
+	// p_sum = max( p_sum + x[i], p_sum - x[i] ), which adds |x[i]| without a sign mask
+	MOVSD  (SI), X8        // X_i[0] = x[i]
+	MOVSD  (SI)(AX*1), X9
+	MOVSD  (SI)(AX*2), X10
+	MOVSD  (SI)(DX*1), X11
+	LEAQ   (SI)(AX*4), SI  // SI = &(SI[incX*4])
+	MOVHPD (SI), X8        // X_i[1] = x[i+4]
+	MOVHPD (SI)(AX*1), X9
+	MOVHPD (SI)(AX*2), X10
+	MOVHPD (SI)(DX*1), X11
+	ADDPD  X8, X0          // p_sum_i += X_i  ( positive values )
+	ADDPD  X9, X2
+	ADDPD  X10, X4
+	ADDPD  X11, X6
+	SUBPD  X8, X1          // p_sum_(i+1) -= X_i  ( negative values )
+	SUBPD  X9, X3
+	SUBPD  X10, X5
+	SUBPD  X11, X7
+	MAXPD  X1, X0          // p_sum_i = max( p_sum_i, p_sum_(i+1) )
+	MAXPD  X3, X2
+	MAXPD  X5, X4
+	MAXPD  X7, X6
+	MOVAPS X0, X1          // p_sum_(i+1) = p_sum_i
+	MOVAPS X2, X3
+	MOVAPS X4, X5
+	MOVAPS X6, X7
+	LEAQ   (SI)(AX*4), SI  // SI = &(SI[incX*4])
+	LOOP   absum_loop      // } while --CX > 0
+
+	// p_sum_0 = p_sum_0 + p_sum_2 + p_sum_4 + p_sum_6  ( X3, X5, X7 hold copies of X2, X4, X6 )
+	ADDPD X3, X0
+	ADDPD X5, X7
+	ADDPD X7, X0
+
+	// p_sum_0[0] = p_sum_0[0] + p_sum_0[1]
+	MOVAPS X0, X1
+	SHUFPD $0x3, X0, X0 // lower( p_sum_0 ) = upper( p_sum_0 )
+	ADDSD  X1, X0
+	CMPQ   BX, $0
+	JE     absum_end    // if BX == 0 { goto absum_end }
+
+absum_tail_start: // Reset loop registers
+	MOVQ  BX, CX // Loop counter:  CX = BX
+	XORPS X8, X8 // X_8 = 0
+
+absum_tail: // do {
+	// p_sum = max( p_sum + x[i], p_sum - x[i] ), which adds |x[i]|
+	MOVSD (SI), X8   // X_8 = x[i]
+	MOVSD X0, X1     // p_sum_1 = p_sum_0
+	ADDSD X8, X0     // p_sum_0 += X_8
+	SUBSD X8, X1     // p_sum_1 -= X_8
+	MAXSD X1, X0     // p_sum_0 = max( p_sum_0, p_sum_1 )
+	ADDQ  AX, SI     // SI = &(SI[incX])
+	LOOP  absum_tail // } while --CX > 0
+
+absum_end: // return p_sum_0
+	MOVSD X0, sum+40(FP)
+	RET
--- a/vendor/gonum.org/v1/gonum/internal/asm/f64/add_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/add_amd64.s
@@ -0,0 +1,66 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+// func Add(dst, s []float64)
+TEXT ·Add(SB), NOSPLIT, $0
+	MOVQ    dst_base+0(FP), DI // DI = &dst
+	MOVQ    dst_len+8(FP), CX  // CX = len(dst)
+	MOVQ    s_base+24(FP), SI  // SI = &s
+	CMPQ    s_len+32(FP), CX   // CX = min( CX, len(s) )
+	CMOVQLE s_len+32(FP), CX
+	CMPQ    CX, $0             // if CX == 0 { return }
+	JE      add_end
+	XORQ    AX, AX             // i = 0
+	MOVQ    DI, BX
+	ANDQ    $0x0F, BX          // BX = &dst & 15
+	JZ      add_no_trim        // if BX == 0 { goto add_no_trim }
+
+	// Align dst on a 16-byte boundary by handling one element separately
+	MOVSD (SI)(AX*8), X0 // X0 = s[i]
+	ADDSD (DI)(AX*8), X0 // X0 += dst[i]
+	MOVSD X0, (DI)(AX*8) // dst[i] = X0
+	INCQ  AX             // i++
+	DECQ  CX             // --CX
+	JE    add_end        // if CX == 0 { return  }
+
+add_no_trim:
+	MOVQ CX, BX
+	ANDQ $7, BX         // BX = CX % 8  ( elements left for the scalar tail )
+	SHRQ $3, CX         // CX = floor( CX / 8 )
+	JZ   add_tail_start // if CX == 0 { goto add_tail_start }
+
+add_loop: // Loop unrolled 8x   do {
+	MOVUPS (SI)(AX*8), X0   // X_i = s[i:i+2]
+	MOVUPS 16(SI)(AX*8), X1
+	MOVUPS 32(SI)(AX*8), X2
+	MOVUPS 48(SI)(AX*8), X3
+	ADDPD  (DI)(AX*8), X0   // X_i += dst[i:i+2]
+	ADDPD  16(DI)(AX*8), X1
+	ADDPD  32(DI)(AX*8), X2
+	ADDPD  48(DI)(AX*8), X3
+	MOVUPS X0, (DI)(AX*8)   // dst[i:i+2] = X_i
+	MOVUPS X1, 16(DI)(AX*8)
+	MOVUPS X2, 32(DI)(AX*8)
+	MOVUPS X3, 48(DI)(AX*8)
+	ADDQ   $8, AX           // i += 8
+	LOOP   add_loop         // } while --CX > 0
+	CMPQ   BX, $0           // if BX == 0 { return }
+	JE     add_end
+
+add_tail_start: // Reset loop registers
+	MOVQ BX, CX // Loop counter: CX = BX
+
+add_tail: // do {
+	MOVSD (SI)(AX*8), X0 // X0 = s[i]
+	ADDSD (DI)(AX*8), X0 // X0 += dst[i]
+	MOVSD X0, (DI)(AX*8) // dst[i] = X0
+	INCQ  AX             // ++i
+	LOOP  add_tail       // } while --CX > 0
+
+add_end:
+	RET
--- a/vendor/gonum.org/v1/gonum/internal/asm/f64/addconst_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/addconst_amd64.s
@@ -0,0 +1,53 @@
+// Copyright ©2016 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+// func AddConst(alpha float64, x []float64)
+TEXT ·AddConst(SB), NOSPLIT, $0
+	MOVQ   x_base+8(FP), SI // SI = &x
+	MOVQ   x_len+16(FP), CX // CX = len(x)
+	CMPQ   CX, $0           // if len(x) == 0 { return }
+	JE     ac_end
+	MOVSD  alpha+0(FP), X4  // X4 = { a, a }
+	SHUFPD $0, X4, X4
+	MOVUPS X4, X5           // X5 = X4
+	XORQ   AX, AX           // i = 0
+	MOVQ   CX, BX
+	ANDQ   $7, BX           // BX = len(x) % 8
+	SHRQ   $3, CX           // CX = floor( len(x) / 8 )
+	JZ     ac_tail_start    // if CX == 0 { goto ac_tail_start }
+
+ac_loop: // Loop unrolled 8x   do {
+	MOVUPS (SI)(AX*8), X0   // X_i = x[i:i+2]
+	MOVUPS 16(SI)(AX*8), X1
+	MOVUPS 32(SI)(AX*8), X2
+	MOVUPS 48(SI)(AX*8), X3
+	ADDPD  X4, X0           // X_i += a
+	ADDPD  X5, X1
+	ADDPD  X4, X2
+	ADDPD  X5, X3
+	MOVUPS X0, (SI)(AX*8)   // x[i:i+2] = X_i
+	MOVUPS X1, 16(SI)(AX*8)
+	MOVUPS X2, 32(SI)(AX*8)
+	MOVUPS X3, 48(SI)(AX*8)
+	ADDQ   $8, AX           // i += 8
+	LOOP   ac_loop          // } while --CX > 0
+	CMPQ   BX, $0           // if BX == 0 { return }
+	JE     ac_end
+
+ac_tail_start: // Reset loop counters
+	MOVQ BX, CX // Loop counter: CX = BX
+
+ac_tail: // do {
+	MOVSD (SI)(AX*8), X0 // X0 = x[i]
+	ADDSD X4, X0         // X0 += a
+	MOVSD X0, (SI)(AX*8) // x[i] = X0
+	INCQ  AX             // ++i
+	LOOP  ac_tail        // } while --CX > 0
+
+ac_end:
+	RET
--- a/vendor/gonum.org/v1/gonum/internal/asm/f64/axpy.go
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/axpy.go
@@ -0,0 +1,62 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:build !amd64 || noasm || gccgo || safe
+// +build !amd64 noasm gccgo safe
+
+package f64
+
+// AxpyUnitary adds alpha times each element of x to the matching element of y:
+//
+//	for i, v := range x {
+//		y[i] += alpha * v
+//	}
+func AxpyUnitary(alpha float64, x, y []float64) {
+	for i := 0; i < len(x); i++ {
+		y[i] += alpha * x[i]
+	}
+}
+
+// AxpyUnitaryTo stores alpha*x[i] + y[i] into dst[i] for every element of x:
+//
+//	for i, v := range x {
+//		dst[i] = alpha*v + y[i]
+//	}
+func AxpyUnitaryTo(dst []float64, alpha float64, x, y []float64) {
+	for i := 0; i < len(x); i++ {
+		dst[i] = alpha*x[i] + y[i]
+	}
+}
+
+// AxpyInc is the strided axpy over n elements, starting at x[ix] and y[iy]:
+//
+//	for i := 0; i < int(n); i++ {
+//		y[iy] += alpha * x[ix]
+//		ix += incX
+//		iy += incY
+//	}
+func AxpyInc(alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) {
+	for left := int(n); left > 0; left-- {
+		y[iy] += alpha * x[ix]
+		ix += incX
+		iy += incY
+	}
+}
+
+// AxpyIncTo is the strided axpy over n elements that writes its result to dst:
+//
+//	for i := 0; i < int(n); i++ {
+//		dst[idst] = alpha*x[ix] + y[iy]
+//		ix += incX
+//		iy += incY
+//		idst += incDst
+//	}
+func AxpyIncTo(dst []float64, incDst, idst uintptr, alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) {
+	for left := int(n); left > 0; left-- {
+		dst[idst] = alpha*x[ix] + y[iy]
+		ix += incX
+		iy += incY
+		idst += incDst
+	}
+}
--- a/vendor/gonum.org/v1/gonum/internal/asm/f64/axpyinc_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/axpyinc_amd64.s
@@ -0,0 +1,142 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+// Some of the loop unrolling code is copied from:
+// http://golang.org/src/math/big/arith_amd64.s
+// which is distributed under these terms:
+//
+// Copyright (c) 2012 The Go Authors. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//    * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//    * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//    * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define X_PTR SI
+#define Y_PTR DI
+#define DST_PTR DI
+#define IDX AX
+#define LEN CX
+#define TAIL BX
+#define INC_X R8
+#define INCx3_X R11
+#define INC_Y R9
+#define INCx3_Y R12
+#define INC_DST R9
+#define INCx3_DST R12
+#define ALPHA X0
+#define ALPHA_2 X1
+
+// func AxpyInc(alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr)
+TEXT ·AxpyInc(SB), NOSPLIT, $0
+	MOVQ x_base+8(FP), X_PTR  // X_PTR = &x
+	MOVQ y_base+32(FP), Y_PTR // Y_PTR = &y
+	MOVQ n+56(FP), LEN        // LEN = n
+	CMPQ LEN, $0              // if LEN == 0 { return }
+	JE   end
+
+	MOVQ ix+80(FP), INC_X        // INC_X = ix  ( stride register used as scratch )
+	MOVQ iy+88(FP), INC_Y        // INC_Y = iy  ( stride register used as scratch )
+	LEAQ (X_PTR)(INC_X*8), X_PTR // X_PTR = &(x[ix])
+	LEAQ (Y_PTR)(INC_Y*8), Y_PTR // Y_PTR = &(y[iy])
+	MOVQ Y_PTR, DST_PTR          // DST_PTR = Y_PTR  // Write pointer
+
+	MOVQ incX+64(FP), INC_X // INC_X = incX * sizeof(float64)
+	SHLQ $3, INC_X
+	MOVQ incY+72(FP), INC_Y // INC_Y = incY * sizeof(float64)
+	SHLQ $3, INC_Y
+
+	MOVSD alpha+0(FP), ALPHA // ALPHA = alpha
+	MOVQ  LEN, TAIL
+	ANDQ  $3, TAIL           // TAIL = n % 4
+	SHRQ  $2, LEN            // LEN = floor( n / 4 )
+	JZ    tail_start         // if LEN == 0 { goto tail_start }
+
+	MOVAPS ALPHA, ALPHA_2            // ALPHA_2 = ALPHA  for pipelining
+	LEAQ   (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3
+	LEAQ   (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = INC_Y * 3
+
+loop:  // do {  // y[i] += alpha * x[i] unrolled 4x.
+	MOVSD (X_PTR), X2            // X_i = x[i]
+	MOVSD (X_PTR)(INC_X*1), X3
+	MOVSD (X_PTR)(INC_X*2), X4
+	MOVSD (X_PTR)(INCx3_X*1), X5
+
+	MULSD ALPHA, X2   // X_i *= a
+	MULSD ALPHA_2, X3
+	MULSD ALPHA, X4
+	MULSD ALPHA_2, X5
+
+	ADDSD (Y_PTR), X2            // X_i += y[i]
+	ADDSD (Y_PTR)(INC_Y*1), X3
+	ADDSD (Y_PTR)(INC_Y*2), X4
+	ADDSD (Y_PTR)(INCx3_Y*1), X5
+
+	MOVSD X2, (DST_PTR)              // y[i] = X_i
+	MOVSD X3, (DST_PTR)(INC_DST*1)
+	MOVSD X4, (DST_PTR)(INC_DST*2)
+	MOVSD X5, (DST_PTR)(INCx3_DST*1)
+
+	LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4])
+	LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[incY*4])
+	DECQ LEN
+	JNZ  loop                    // } while --LEN > 0
+	CMPQ TAIL, $0                // if TAIL == 0 { return }
+	JE   end
+
+tail_start: // Reset Loop registers
+	MOVQ TAIL, LEN // Loop counter: LEN = TAIL
+	SHRQ $1, LEN   // LEN = floor( LEN / 2 )
+	JZ   tail_one
+
+tail_two:
+	MOVSD (X_PTR), X2              // X_i = x[i]
+	MOVSD (X_PTR)(INC_X*1), X3
+	MULSD ALPHA, X2                // X_i *= a
+	MULSD ALPHA, X3
+	ADDSD (Y_PTR), X2              // X_i += y[i]
+	ADDSD (Y_PTR)(INC_Y*1), X3
+	MOVSD X2, (DST_PTR)            // y[i] = X_i
+	MOVSD X3, (DST_PTR)(INC_DST*1)
+
+	LEAQ (X_PTR)(INC_X*2), X_PTR // X_PTR = &(X_PTR[incX*2])
+	LEAQ (Y_PTR)(INC_Y*2), Y_PTR // Y_PTR = &(Y_PTR[incY*2])
+
+	ANDQ $1, TAIL
+	JZ   end      // if TAIL == 0 { goto end }
+
+tail_one:
+	// y[i] += alpha * x[i] for the single remaining element.
+	MOVSD (X_PTR), X2   // X2 = x[i]
+	MULSD ALPHA, X2     // X2 *= a
+	ADDSD (Y_PTR), X2   // X2 += y[i]
+	MOVSD X2, (DST_PTR) // y[i] = X2
+
+end:
+	RET
--- a/vendor/gonum.org/v1/gonum/internal/asm/f64/axpyincto_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/axpyincto_amd64.s
@@ -0,0 +1,148 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+// Some of the loop unrolling code is copied from:
+// http://golang.org/src/math/big/arith_amd64.s
+// which is distributed under these terms:
+//
+// Copyright (c) 2012 The Go Authors. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//    * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//    * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//    * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define X_PTR SI
+#define Y_PTR DI
+#define DST_PTR DX
+#define IDX AX
+#define LEN CX
+#define TAIL BX
+#define INC_X R8
+#define INCx3_X R11
+#define INC_Y R9
+#define INCx3_Y R12
+#define INC_DST R10
+#define INCx3_DST R13
+#define ALPHA X0
+#define ALPHA_2 X1
+
+// func AxpyIncTo(dst []float64, incDst, idst uintptr, alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr)
+TEXT ·AxpyIncTo(SB), NOSPLIT, $0
+	MOVQ dst_base+0(FP), DST_PTR // DST_PTR := &dst
+	MOVQ x_base+48(FP), X_PTR    // X_PTR := &x
+	MOVQ y_base+72(FP), Y_PTR    // Y_PTR := &y
+	MOVQ n+96(FP), LEN           // LEN := n
+	CMPQ LEN, $0                 // if LEN == 0 { return }
+	JE   end
+
+	MOVQ ix+120(FP), INC_X             // INC_X = ix  ( stride register used as scratch )
+	LEAQ (X_PTR)(INC_X*8), X_PTR       // X_PTR = &(x[ix])
+	MOVQ iy+128(FP), INC_Y             // INC_Y = iy  ( stride register used as scratch )
+	LEAQ (Y_PTR)(INC_Y*8), Y_PTR       // Y_PTR = &(y[iy])
+	MOVQ idst+32(FP), INC_DST          // INC_DST = idst  ( stride register used as scratch )
+	LEAQ (DST_PTR)(INC_DST*8), DST_PTR // DST_PTR = &(dst[idst])
+
+	MOVQ  incX+104(FP), INC_X    // INC_X = incX * sizeof(float64)
+	SHLQ  $3, INC_X
+	MOVQ  incY+112(FP), INC_Y    // INC_Y = incY * sizeof(float64)
+	SHLQ  $3, INC_Y
+	MOVQ  incDst+24(FP), INC_DST // INC_DST = incDst * sizeof(float64)
+	SHLQ  $3, INC_DST
+	MOVSD alpha+40(FP), ALPHA    // ALPHA = alpha
+
+	MOVQ LEN, TAIL
+	ANDQ $3, TAIL   // TAIL = n % 4
+	SHRQ $2, LEN    // LEN = floor( n / 4 )
+	JZ   tail_start // if LEN == 0 { goto tail_start }
+
+	MOVSD ALPHA, ALPHA_2                  // ALPHA_2 = ALPHA for pipelining
+	LEAQ  (INC_X)(INC_X*2), INCx3_X       // INCx3_X = INC_X * 3
+	LEAQ  (INC_Y)(INC_Y*2), INCx3_Y       // INCx3_Y = INC_Y * 3
+	LEAQ  (INC_DST)(INC_DST*2), INCx3_DST // INCx3_DST = INC_DST * 3
+
+loop:  // do {  // dst[i] = alpha * x[i] + y[i] unrolled 4x.
+	MOVSD (X_PTR), X2            // X_i = x[i]
+	MOVSD (X_PTR)(INC_X*1), X3
+	MOVSD (X_PTR)(INC_X*2), X4
+	MOVSD (X_PTR)(INCx3_X*1), X5
+
+	MULSD ALPHA, X2   // X_i *= a
+	MULSD ALPHA_2, X3
+	MULSD ALPHA, X4
+	MULSD ALPHA_2, X5
+
+	ADDSD (Y_PTR), X2            // X_i += y[i]
+	ADDSD (Y_PTR)(INC_Y*1), X3
+	ADDSD (Y_PTR)(INC_Y*2), X4
+	ADDSD (Y_PTR)(INCx3_Y*1), X5
+
+	MOVSD X2, (DST_PTR)              // dst[i] = X_i
+	MOVSD X3, (DST_PTR)(INC_DST*1)
+	MOVSD X4, (DST_PTR)(INC_DST*2)
+	MOVSD X5, (DST_PTR)(INCx3_DST*1)
+
+	LEAQ (X_PTR)(INC_X*4), X_PTR       // X_PTR = &(X_PTR[incX*4])
+	LEAQ (Y_PTR)(INC_Y*4), Y_PTR       // Y_PTR = &(Y_PTR[incY*4])
+	LEAQ (DST_PTR)(INC_DST*4), DST_PTR // DST_PTR = &(DST_PTR[incDst*4])
+	DECQ LEN
+	JNZ  loop                          // } while --LEN > 0
+	CMPQ TAIL, $0                      // if TAIL == 0 { return }
+	JE   end
+
+tail_start: // Reset Loop registers
+	MOVQ TAIL, LEN // Loop counter: LEN = TAIL
+	SHRQ $1, LEN   // LEN = floor( LEN / 2 )
+	JZ   tail_one
+
+tail_two:
+	MOVSD (X_PTR), X2              // X_i = x[i]
+	MOVSD (X_PTR)(INC_X*1), X3
+	MULSD ALPHA, X2                // X_i *= a
+	MULSD ALPHA, X3
+	ADDSD (Y_PTR), X2              // X_i += y[i]
+	ADDSD (Y_PTR)(INC_Y*1), X3
+	MOVSD X2, (DST_PTR)            // dst[i] = X_i
+	MOVSD X3, (DST_PTR)(INC_DST*1)
+
+	LEAQ (X_PTR)(INC_X*2), X_PTR       // X_PTR = &(X_PTR[incX*2])
+	LEAQ (Y_PTR)(INC_Y*2), Y_PTR       // Y_PTR = &(Y_PTR[incY*2])
+	LEAQ (DST_PTR)(INC_DST*2), DST_PTR // DST_PTR = &(DST_PTR[incDst*2])
+
+	ANDQ $1, TAIL
+	JZ   end      // if TAIL == 0 { goto end }
+
+tail_one:
+	MOVSD (X_PTR), X2   // X2 = x[i]
+	MULSD ALPHA, X2     // X2 *= a
+	ADDSD (Y_PTR), X2   // X2 += y[i]
+	MOVSD X2, (DST_PTR) // dst[i] = X2
+
+end:
+	RET
--- a/vendor/gonum.org/v1/gonum/internal/asm/f64/axpyunitary_amd64.s
+++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/axpyunitary_amd64.s
@@ -0,0 +1,134 @@
+// Copyright ©2015 The Gonum Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+//
+// Some of the loop unrolling code is copied from:
+// http://golang.org/src/math/big/arith_amd64.s
+// which is distributed under these terms:
+//
+// Copyright (c) 2012 The Go Authors. All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//    * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//    * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//    * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// +build !noasm,!gccgo,!safe
+
+#include "textflag.h"
+
+#define X_PTR SI // base address of x
+#define Y_PTR DI // base address of y
+#define DST_PTR DI // same register as Y_PTR: results are stored back into y
+#define IDX AX // element index i, used as (ptr)(IDX*8)
+#define LEN CX // remaining element count, then loop iteration counter
+#define TAIL BX // scratch for the alignment check, then count of leftover elements
+#define ALPHA X0 // alpha broadcast to both float64 lanes
+#define ALPHA_2 X1 // second copy of ALPHA
+
+// func AxpyUnitary(alpha float64, x, y []float64)
+TEXT ·AxpyUnitary(SB), NOSPLIT, $0 // y[i] += alpha * x[i] for 0 <= i < min( len(x), len(y) )
+	MOVQ    x_base+8(FP), X_PTR  // X_PTR = &x[0]
+	MOVQ    y_base+32(FP), Y_PTR // Y_PTR = &y[0]; DST_PTR is #defined to the same register
+	MOVQ    x_len+16(FP), LEN    // LEN = len(x)
+	CMPQ    y_len+40(FP), LEN    // compare len(y) with LEN
+	CMOVQLE y_len+40(FP), LEN    // if len(y) <= LEN { LEN = len(y) }, i.e. LEN = min( len(x), len(y) )
+	CMPQ    LEN, $0              // if LEN == 0 { return }
+	JE      end
+	XORQ    IDX, IDX             // i = 0
+	MOVSD   alpha+0(FP), ALPHA   // ALPHA low lane = alpha
+	SHUFPD  $0, ALPHA, ALPHA     // ALPHA = { alpha, alpha }
+	MOVUPS  ALPHA, ALPHA_2       // ALPHA_2 = ALPHA; the main loop alternates the two copies (original note: for pipelining)
+	MOVQ    Y_PTR, TAIL          // Check memory alignment of y
+	ANDQ    $15, TAIL            // TAIL = &y % 16
+	JZ      no_trim              // if TAIL == 0 { goto no_trim }
+
+	// y is not 16-byte aligned: handle element 0 with scalar ops so the packed code below starts at &y[1] (presumably 16-byte aligned, assuming 8-byte aligned float64 data)
+	MOVSD (X_PTR), X2   // X2 := x[0]
+	MULSD ALPHA, X2     // X2 *= a
+	ADDSD (Y_PTR), X2   // X2 += y[0]
+	MOVSD X2, (DST_PTR) // y[0] = X2
+	INCQ  IDX           // i++
+	DECQ  LEN           // LEN--
+	JZ    end           // if LEN == 0 { return }
+
+no_trim: // LEN = n = number of elements still to process (n > 0 here)
+	MOVQ LEN, TAIL
+	ANDQ $7, TAIL   // TAIL := n % 8
+	SHRQ $3, LEN    // LEN = floor( n / 8 )
+	JZ   tail_start // if LEN == 0 { goto tail_start }
+
+loop:  // do {
+	// y[i] += alpha * x[i] unrolled 8x.
+	MOVUPS (X_PTR)(IDX*8), X2   // X2 = x[i:i+2]
+	MOVUPS 16(X_PTR)(IDX*8), X3 // X3 = x[i+2:i+4]
+	MOVUPS 32(X_PTR)(IDX*8), X4 // X4 = x[i+4:i+6]
+	MOVUPS 48(X_PTR)(IDX*8), X5 // X5 = x[i+6:i+8]
+
+	MULPD ALPHA, X2   // X_i *= a
+	MULPD ALPHA_2, X3
+	MULPD ALPHA, X4
+	MULPD ALPHA_2, X5
+
+	ADDPD (Y_PTR)(IDX*8), X2   // X_i += y[i]; y is read directly as a packed memory operand
+	ADDPD 16(Y_PTR)(IDX*8), X3
+	ADDPD 32(Y_PTR)(IDX*8), X4
+	ADDPD 48(Y_PTR)(IDX*8), X5
+
+	MOVUPS X2, (DST_PTR)(IDX*8)   // y[i:i+2] = X2
+	MOVUPS X3, 16(DST_PTR)(IDX*8) // y[i+2:i+4] = X3
+	MOVUPS X4, 32(DST_PTR)(IDX*8) // y[i+4:i+6] = X4
+	MOVUPS X5, 48(DST_PTR)(IDX*8) // y[i+6:i+8] = X5
+
+	ADDQ $8, IDX  // i += 8
+	DECQ LEN
+	JNZ  loop     // } while --LEN > 0
+	CMPQ TAIL, $0 // if TAIL == 0 { return }
+	JE   end
+
+tail_start: // Reset loop registers; TAIL != 0 on every path that reaches this label
+	MOVQ TAIL, LEN // Loop counter: LEN = TAIL
+	SHRQ $1, LEN   // LEN = floor( TAIL / 2 )
+	JZ   tail_one  // if LEN == 0 { goto tail_one }
+
+tail_two: // do {
+	MOVUPS (X_PTR)(IDX*8), X2   // X2 = x[i:i+2]
+	MULPD  ALPHA, X2            // X2 *= a
+	ADDPD  (Y_PTR)(IDX*8), X2   // X2 += y[i:i+2]
+	MOVUPS X2, (DST_PTR)(IDX*8) // y[i:i+2] = X2
+	ADDQ   $2, IDX              // i += 2
+	DECQ   LEN
+	JNZ    tail_two             // } while --LEN > 0
+
+	ANDQ $1, TAIL // TAIL = TAIL % 2
+	JZ   end      // if TAIL == 0 { goto end }
+
+tail_one: // one element left over
+	MOVSD (X_PTR)(IDX*8), X2   // X2 = x[i]
+	MULSD ALPHA, X2            // X2 *= a
+	ADDSD (Y_PTR)(IDX*8), X2   // X2 += y[i]
+	MOVSD X2, (DST_PTR)(IDX*8) // y[i] = X2
+
+end:
+	RET
--- a/Show More
+++ b/Show More