NumPy 基础使用

8 minute read

Published: August 15, 2019

下面介绍关于 NumPy 的基本函数使用。

在机器学习中不要使用一维数组，请使用二维矩阵！

请注意函数返回的是否是一维数组

请注意浅拷贝与深拷贝的区别

请注意设置 dtype

数据类型转换请使用 astype()，不要直接修改 dtype

numpy 中 array 和 asarray 的区别

array 和 asarray 都可以将结构数据转化为 ndarray，但是主要区别就是当数据源是 ndarray 时，array 仍然会 copy 出一个副本，占用新的内存，但 asarray 不会，返回的是视图。

创建数组

不要创建一维数组，请创建二维矩阵

import numpy as np

# 创建数组
a = np.array([[1, 2, 3], [4, 5, 6]])        # int64
b = np.array([[1.0, 2.0, 3.0], [4, 5, 6]])  # float64

# 设置 dtype: np.int32, np.float32, np.complex
# 注意：由于 torch 默认创建 int 是 64 位的，float 是 32 位的，也要指定 dtype。
a = np.array([1, 2, 3], dtype=np.int64)     # int64
b = np.array([1, 2, 3], dtype=np.float32)   # float32
c = np.array([1, 2, 3], dtype=np.complex)   # complex

# a = np.array(1, 2, 3)  # wrong


# 下面两种的不同
a = np.array([[1, 2, 3]])
print(a.shape)  # (1, 3)
a = np.array([1, 2, 3])
print(a.shape)  # (3,)


# 下面两种的不同
a = np.random.randn(3, 1)
print(a.shape)  # (3, 1)
a = np.random.randn(3)
print(a.shape)  # (3,)


# 创建全零数组 类型都是float64
a = np.zeros((4, 3))  # 数据全为 0，4 行 3 列


# 创建全 1 数组
b = np.ones((4, 3))  # 数据全为 0，4 行 3 列


# 用 arange 创建连续数组:
a = np.arange(1, 11, 2)  # 1~10 的数据，2 步长，[1, 3, 5, 7, 9]


# 用 linspace 创建线段型数据:
a = np.linspace(0, np.pi, 3)  # [0., 1.57079633, 3.14159265]

创建随机数组

# 标准正态分布
a = np.random.randn(2,3)
a = np.random.standard_normal(size=(2,3))

# 正态分布：均值为 loc，标准差为 scale
a = np.random.normal(loc=0.0, scale=1.0, size=(2,3))

# 均匀分布：[0,1) 之间的的随机浮点数
a = np.random.rand(2,3)
a = np.random.ranf(size=(2,3))
a = np.random.random(size=(2,3))
a = np.random.random_sample(size=(2,3))

# 均匀分布：[low, high) 之间的的随机浮点数
np.random.uniform(low=2, high=6, size=(2,3))#指定均匀分布

# 均匀分布：[low, high) 之间的随机整数
a = np.random.randint(low=2, high=6, size=(2,3))

# 均匀分布：[low, high] 之间的随机整数
a = np.random.random_integers(low=2, high=6, size=(2,3))

# 均匀分布：从给定的列表中随机选择
# a : 1-D array-like or int
# replace : Whether the sample is with or without replacement
# p : 1-D array-like, The probabilities associated with each entry in a.
np.random.choice(a=[1, 2, 3], size=(2,3), replace=True, p=[0.1,0.2,0.7])

# 随机打乱数组（in-place）
# 如果 a 为多维数组，只沿第一条轴打乱。
a = [1, 2, 3, 4, 5]
b = np.random.shuffle(a)
print(a)        # [3, 1, 5, 4, 2]
print(type(a))  # <class 'list'>
print(b)        # None

# 随机打乱数组（not in-place）
# 如果 a 为多维数组，只沿第一条轴打乱。
a = [1, 2, 3, 4, 5]
b = np.random.permutation(a)
print(a)        # [3, 1, 5, 4, 2]
print(type(a))  # <class 'list'>
print(b)        # [3 4 2 5 1]
print(type(b))  # <class 'numpy.ndarray'>

创建网格数据

import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(0,1000,20)
y = np.linspace(0,500,20)

X, Y = np.meshgrid(x, y)

plt.plot(X, Y,
         color='limegreen',  # 设置颜色为limegreen
         marker='.',         # 设置点类型为圆点
         linestyle='')       # 设置线型为空，也即没有线连接点
plt.grid(True)
plt.show()

array 常用的属性

# 维度
print('number of dim:', a.ndim)

# 行数和列数 (list对象没有shape这个属性)
print('shape :', a.shape)
[row, col] = a.shape

# 对应维度的长度
print('shape :', a.shape[n])

# 元素个数
print('size:', a.size)

# 数据类型
print('dtype:', a.dtype)

数据类型转换

numpy 中的数据类型转换，只能用函数 astype()，不能直接改原数据的 dtype，否则会改变数组维度！。

a = np.array([1, 2, 3], dtype=np.int32)  # int32
b = a.astype(np.float32)                 # float32

维度改变

将 1 维转换成 2 维

print(a[np.newaxis,:])        # [[1 1 1]] 

print(a[np.newaxis,:].shape)  # (1, 3)

print(a[:,np.newaxis])        # [[1], [1], [1]]

print(a[:,np.newaxis].shape)  # (3, 1)

改变形状

# reshape 改变数据的形状 但元素的 size 是不变的
a = np.arange(12).reshape(3, 4)    # 0~11 的数据，3 行 4 列
# a = np.arange(12).reshape(2, 4)  # wrong

矩阵拉平

a = np.array([[1, 2, 3], [4, 5, 6]])
b = np.array([[1], [2], [3]])


# np.flatten() 返回一份拷贝，对拷贝所做修改不会影响原始矩阵
print(a.flatten())  # [1, 2, 3, 4, 5, 6]
print(b.flatten())  # [1, 2, 3]


# np.ravel() 返回的是视图，修改时会影响原始矩阵 
print(a.ravel())    # [1, 2, 3, 4, 5, 6]
print(b.flatten())  # [1, 2, 3]


# np.squeeze() 只能对维数为1的维度降维, 返回的是视图，修改时会影响原始矩阵 
print(a.squeeze())  # [[1, 2, 3], [4, 5, 6]]
print(b.squeeze())  # [1, 2, 3]


# np.reshape(-1) 返回的是视图，修改时会影响原始矩阵 
print(a.reshape(-1))  # [[1, 2, 3, 4, 5, 6]]
print(b.reshape(-1))  # [1, 2, 3]

基本运算

numpy 的数组类被称为 ndarray，ndarray 的基本运算大都是对应位置的两个元素之间进行运算，除了 dot （矩阵乘法）

a = np.array([10, 20, 30])

b = np.arange(1, 4)

c = a + b  # [11, 22, 33]

d = a - b  # [9, 18, 27]

e = a * b  # [10, 40, 90]

f = np.dot(a, b)  # 另一种写法 f = a.dot(b) # 140

g = b**2   # [1, 4, 9]

broadcasting

矩阵转置

a = np.array([[1, 2, 3], [4, 5, 6]])

print(a.T)
print(np.transpose(a))


# 但是对 1 维数组来说，这样求得的转置是不对的，得先 reshape 成二维数组
b = np.array([1, 1, 1])
print(b.shape)    # (3,)
print(b.T.shape)  # (3,)
print(np.transpose(b).shape)  # (3,)

c = b[:, np.newaxis]
print(c.shape)    # (3, 1)
print(c.T.shape)  # (1, 3)
print(np.transpose(c).shape)  # (1, 3)

舍入函数

np.around()  # 函数返回指定数字的四舍五入值。

np.floor()   # 返回数字的下舍整数。

np.ceil()    # 返回数字的上入整数。

`*`、`np.dot()`、`np.multiply()` 区别

	`list`	`np.array`	`np.mat()`
`*`	error	点乘	叉乘
`np.dot()`	一维内积，其他叉乘	一维内积，其他叉乘	叉乘
`np.multiply()`	点乘	点乘	点乘

参数 axis

axis 用来指明将要进行的运算是沿着哪个轴执行，通常 numpy 里面的一些降维操作(aggregate functions)需要我们指定对应的维度，比如 sum 函数表示对哪个维度求和，max 表示对哪个维度求最大值。通常当我们在这些函数里面指定了 axis=n 时，那么函数输出的数组当中，原来的第 n 维就被消除了，沿着该轴的方向走去执行操作。

求最值

求矩阵和/最小值/最大值，axis 指定维度，不指定默认整个矩阵

a = np.array([[1, 2, 3], [4, 5, 6]]) 

print(a.sum())        # 21

print(a.sum(axis=0))  # [5, 7, 9]

print(a.sum(axis=1))  # [6, 15]

print(a.min())        # 1

print(a.min(axis=0))  # [1, 2, 3]

print(a.min(axis=1))  # [1, 4]

print(a.max())        # 6

print(a.max(axis=0))  # [4, 5, 6]

print(a.max(axis=1))  # [3, 6]

索引

在索引的时候不要写成 [][] 的形式，要使用 [,]

a = np.arange(12).reshape(3,4)
"""
[[ 0,  1,  2,  3],
 [ 4,  5,  6,  7],
 [ 8,  9, 10, 11]]
"""


# 求最小/最大元素的索引，索引的是平铺数组，可输入参数 axis
print(np.argmin(a))  # 0
print(np.argmax(a))  # 11
print(np.argmin(a, axis=0))  # [0 0 0 0]
print(np.argmax(a, axis=1))  # [3 3 3]


# 获取多维数组最值索引
index = np.unravel_index(a.argmax(), a.shape)  # (2, 3)


# 一维索引  
print(a[0])  # [0 1 2 3]


# 二维索引  
print(a[0][1])      # 1
print(a[0, 1])      # 1
print(a[1, 1:3])    # [5 6]
print(a[1, :])      # [4 5 6 7]
print(a[:, -1])     # [3 7 11]
print(a[:, 1:3])    # [[1  2], [5  6], [9 10]]
print(a[:, 1:3:2])  # [[1], [5], [9]]，列从索引 1 开始到索引 3 停止，间隔为 2
print(a[:, [1, 2]])  # [[1  2], [5  6], [9 10]]
print(a[[0, 1], [1, 2]])  # [1, 6]


# 布尔索引
print(a[a > 5])  # [ 6,  7,  8,  9, 10, 11]


# 数组遍历
[row, col] = a.shape
for i in range(row):
	for j in range(col):
  	print(a[i, j])

    
numpy.where()  # 函数返回输入数组中满足给定条件的元素的索引。

拼接

1 维数组拼接

建议使用 np.vstack()、np.hstack()、np.stack()

a = np.array([1, 1, 1])
b = np.array([2, 2, 2])


# 对比 np.vstack() 和 np.hstack()
print(np.vstack((a, b)))
"""
[[1 1 1]
 [2 2 2]]
"""

print(np.hstack((a, b)))  # [1 1 1 2 2 2]


# 对比 np.concatenate()
print(np.concatenate((a, b), axis=0))  # [1 1 1 2 2 2]

print(np.concatenate((a, b), axis=1))  # wrong


# 对比 np.append()
print(np.append(a, b, axis=0))  # [1 1 1 2 2 2]

print(np.append(a, b, axis=1))  # wrong


# 对比 np.r_() 和 np.c_()
print(np.r_[a, b])  # [1 1 1 2 2 2]

print(np.c_[a, b])	
"""
[[1 2]
 [1 2]
 [1 2]]
"""


# 对比 np.stack()
print(np.stack((a, b)))
"""
[[1 1 1]
 [2 2 2]]
"""
print(np.stack((a, b)).shape)  # (2, 3)

2 维矩阵拼接

建议使用 np.vstack()、np.hstack()、np.stack()

a = np.array([[1, 1, 1]])
b = np.array([[2, 2, 2]])


# 对比 np.vstack() 和 np.hstack()
print(np.vstack((a, b)))
"""
[[1 1 1]
 [2 2 2]]
"""

print(np.hstack((a, b)))  # [[1 1 1 2 2 2]]


# 对比 np.concatenate()
print(np.concatenate((a, b), axis=0))
"""
[[1 1 1]
 [2 2 2]]
"""

print(np.concatenate((a, b),axis=1))  # [[1 1 1 2 2 2]]


# 对比 np.append()
print(np.append(a, b, axis=0))
"""
[[1 1 1]
 [2 2 2]]
"""

print(np.append(a, b, axis=1))  # [[1 1 1 2 2 2]]


# 对比 np.r_() 和 np.c_()
print(np.r_[a, b])
"""
[[1 1 1]
 [2 2 2]]
"""

print(np.c_[a, b])  # [[1 1 1 2 2 2]]


# 对比 np.stack()
print(np.stack((a, b)))
"""
[[[1 1 1]]
 [[2 2 2]]]
"""
print(np.stack((a, b)).shape)  # (2, 1, 3)

矩阵分割

a = np.arange(12).reshape(3,4)

# 矩阵等量分割
print(np.split(a, 2, axis=1))  # 纵向分割
"""
[array([[0, 1],
       [4, 5],
       [8, 9]]), 
 array([[ 2,  3],
       [ 6,  7],
       [10, 11]])]
"""

print(np.split(a, 3, axis=0))  # 横向分割
"""
[array([[0, 1, 2, 3]]),
 array([[4, 5, 6, 7]]), 
 array([[ 8,  9, 10, 11]])]
"""

# print(np.split(a, 3, axis=1))	# wrong，矩阵只有 4 列，企图在纵向分割成 3 份，无法等量分割


# 矩阵不等量分割
print(np.array_split(a, 3, axis=1))
"""
[array([[0, 1],
       [4, 5],
       [8, 9]]),
 array([[ 2],
       [ 6],
       [10]]),
 array([[ 3],
       [ 7],
       [11]])]
"""

NumPy IO

a = np.arange(0, 10)
np.savetxt("out.txt", a, fmt="%s", delimiter=",")  # 改为保存为字符串，以逗号分隔
b = np.loadtxt("out.txt", delimiter=",")  # load 时也要指定为逗号分隔

占用内存大小

import numpy as np
import sys

a = np.array([1, 1, 1],dtype=np.int64)
b = np.array([],dtype=np.int64)

print(a.nbytes)          # 24
print(sys.getsizeof(a))  # 128

print(b.nbytes)          # 0
print(sys.getsizeof(b))  # 104