Griffin Chow / NumPy学习笔记

Created Wed, 10 Apr 2024 00:00:00 +0000 Modified Thu, 06 Nov 2025 04:24:59 +0000
3199 Words

1 安装NumPy

1.1 pip安装

python --version        # 核对 Python
pip --version           # 核对 pip
pip install numpy       # 最新版
pip install numpy==1.21.0   # 指定版本
# 国内加速
pip install numpy -i https://pypi.tuna.tsinghua.edu.cn/simple

常见:权限用 sudo 或管理员;未找到命令→检查 PATH。

1.2 conda安装

conda --version
conda create -n myenv python=3.11
conda activate myenv
conda install numpy

1.3 常见问题

  • 找不到 pip / conda:添加到环境变量。
  • 权限错误:用管理员或 sudo。
  • 超时:换镜像或检查网络。
  • 版本冲突:先 pip list / conda list 再升级。

1.4 验证

import numpy as np
print(np.__version__)
print(np.array([1,2,3]))

2 ndarray 概述

2.1 特性

同质数据、连续内存、高效向量化、广播自动扩展维度。

2.2 创建

import numpy as np
a1 = np.array([1,2,3,4])
a2 = np.array([[1,2,3],[4,5,6]])
ar = np.arange(0,10,2)            # 步长
ls = np.linspace(0,1,5)           # 等距
z  = np.zeros((2,3))
o  = np.ones((2,3))
e  = np.empty((2,3))              # 未初始化
r1 = np.random.rand(5)            # [0,1)均匀
r2 = np.random.randint(0,10,(2,3))

2.3 基本操作

arr = np.array([[1,2,3],[4,5,6]])
print(arr[0,1])        # 访问
arr[0,1] = 10          # 修改
print(arr[0])          # 行
print(arr[:,1])        # 列
sub = arr[:2,:2]       # 切片视图
step = np.array([[1,2,3,4,5],[6,7,8,9,10]])[0,::2]  # 步长
mask = arr > 5
print(arr[mask])       # 布尔索引
idx  = np.array([0,1])
print(arr[idx,[1,2]])  # 花式索引

2.4 与列表对比

  • 速度:向量化比列表推导快。
  • 内存:连续块 vs 指针数组。
  • 类型:数组单一 dtype;列表可混合。
# Rough performance comparison (illustrative only): doubling every element
# with a list comprehension versus a single vectorized NumPy expression.
import time
N = 100_000  # same value as the old literal, regrouped so it no longer reads like one million
py = list(range(N))
# perf_counter() is a high-resolution clock meant for measuring short
# durations; time.time() can be too coarse to time the NumPy line at all.
t = time.perf_counter()
py = [x*2 for x in py]
print("list:", time.perf_counter()-t)
np_a = np.arange(N)
t = time.perf_counter()
np_a = np_a*2
print("numpy:", time.perf_counter()-t)

3 数据类型 (dtype)

3.1 常见类型

np.int8/16/32/64, np.float32/64, np.complex128, np.bool_, np.str_.

i32 = np.array([1,2,3], dtype=np.int32)
f64 = np.array([1.,2.5,3.7], dtype=np.float64)
c128= np.array([1+2j,3+4j], dtype=np.complex128)
b   = np.array([True,False], dtype=np.bool_)
s   = np.array(['hello','world'], dtype=np.str_)

3.2 类型转换

arr  = np.array([1.5,2.7,3.9])
print(arr.astype(np.int32))      # 截断
print(np.array([1,2,3]).astype(np.float64))
print(np.array([True,False]).astype(int))
print(np.array(['1.1','2.2']).astype(float))
try:
    np.array(['1','two']).astype(float)
except ValueError as e:
    print("错误:", e)

4 数组属性

arr = np.array([[1,2,3],[4,5,6]])
print(arr.shape, arr.ndim, arr.size, arr.dtype, arr.itemsize, arr.nbytes)
r   = np.arange(12).reshape(3,4)       # 重塑
col = np.array([1,2,3])[:,np.newaxis]  # 增维

5 创建方式补充

orig = np.array([1,2,3,4,5])
new  = np.array(orig)          # 复制
rng  = np.arange(0,10,2)
lin  = np.linspace(0,1,4)
np.random.seed(42)
print(np.random.rand(2,3))
print(np.random.randn(3,2))
print(np.random.randint(0,10,(2,3)))

6 切片与索引

one = np.array([10,20,30,40])
print(one[0], one[-1])

two = np.array([[1,2,3],[4,5,6]])
print(two[0,1], two[1,2])

three = np.array([[[1,2],[3,4]], [[5,6],[7,8]]])
print(three[0,1,0])          # 多维

data = np.array([10,20,30,40,50])
print(data[data>25])         # 布尔
print(data[[0,2,4]])         # 花式

7 高级索引与视图

base = np.array([[1,2,3],[4,5,6],[7,8,9]])
sub  = base[1:3,0:2]         # 视图
sub[0,0] = 99                # 修改同步到 base
print(base)

even = base[base%2==0]
rows = base[[0,2]]
gt5  = base[base>5]
print(even, rows, gt5)

8 数组运算与广播

8.1 基本运算

a = np.array([1,2,3]); b = np.array([4,5,6])
print(a+b, b-a, a*b, b/a, a+10)

8.2 广播

A = np.array([[1,2,3],[4,5,6]])
B = np.array([10,20,30])     # 广播到 (2,3)
print(A + B)
C = np.array([[1],[2],[3]])
A2= np.array([[1,2,3],[4,5,6],[7,8,9]])
print(A2 + C)

8.3 聚合

print(np.sum(A), np.mean(A), np.min(A), np.max(A))
print(np.sum(A,axis=0), np.sum(A,axis=1))   # 列 & 行

9 数组操作

9.1 拼接

x = np.array([[1,2,3],[4,5,6]])
y = np.array([[7,8,9]])
print(np.vstack((x,y)))
z = np.array([[10,11,12],[13,14,15]])
print(np.hstack((x,z)))
print(np.dstack((x,x)))
print(np.concatenate((x,y), axis=0))

9.2 重塑

r = np.arange(12).reshape(4,3)

9.3 复制与视图

orig = np.array([1,2,3,4,5])
cp   = orig.copy()
vw   = orig[1:4]       # 视图
vw[0]= 99
print(orig, cp, vw)

10 数学函数与示例

10.1 常用

import numpy as np
ang = np.array([0, np.pi/2, np.pi])
print(np.sin(ang), np.cos(ang))
print(np.exp([0,1,2]))
print(np.log([1,np.e,10]))
print(np.log10([1,10,100]))

10.2 综合示例(代码关键注释)

import numpy as np
# 生成等间隔角度
x = np.linspace(0, 2*np.pi, 100)
y_sin = np.sin(x)        # 正弦
y_cos = np.cos(x)        # 余弦
y_exp = np.exp(x)        # 指数
y_log = np.log(x + 1e-10)# 对数(避开0)
{
  "title": {"text": "三角与指数/对数函数曲线"},
  "tooltip": {"trigger": "axis"},
  "legend": {"data": ["sin(x)", "cos(x)", "exp(x)", "log(x)"]},
  "xAxis": {"type": "category", "data": Array.from({length:100}, (_,i)=> (2*Math.PI*i/99).toFixed(2))},
  "yAxis": {"type": "value"},
  "series": [
    {"type": "line", "name": "sin(x)", "data": Array.from({length:100}, (_,i)=> Math.sin(2*Math.PI*i/99))},
    {"type": "line", "name": "cos(x)", "data": Array.from({length:100}, (_,i)=> Math.cos(2*Math.PI*i/99))},
    {"type": "line", "name": "exp(x)", "data": Array.from({length:100}, (_,i)=> Math.exp(2*Math.PI*i/99))},
    {"type": "line", "name": "log(x)", "data": Array.from({length:100}, (_,i)=> Math.log(2*Math.PI*i/99 + 1e-10))}
  ]
}
# 角度列表(部分角度演示)
angles = np.array([0, np.pi/6, np.pi/4, np.pi/3, np.pi/2])
sin_vals = np.sin(angles)
cos_vals = np.cos(angles)
{
  "title": {"text": "角度与正弦余弦值"},
  "tooltip": {},
  "xAxis": {"type": "category", "data": ["0","π/6","π/4","π/3","π/2"]},
  "yAxis": {"type": "value"},
  "legend": {"data": ["sin","cos"]},
  "series": [
    {"type": "bar", "name": "sin", "data": [0, 0.5, 0.7071, 0.8660, 1]},
    {"type": "bar", "name": "cos", "data": [1, 0.8660, 0.7071, 0.5, 0]}
  ]
}

10.3 练习

angles = np.array([0,np.pi/6,np.pi/4,np.pi/3,np.pi/2])
print(np.sin(angles), np.cos(angles))
x = np.array([1,2,3,4,5])
print(np.exp(x), np.log(x))

11 统计函数

11.1 概述

NumPy 提供描述性统计函数,可快速衡量数据的集中趋势与离散程度:np.mean、np.median、np.std、np.var 等。可指定 axis 做按行或按列统计。

11.2 描述性统计示例

import numpy as np
data = np.array([1,2,3,4,5])
print(np.mean(data))    # 均值 3.0
print(np.median(data))  # 中位数 3.0
print(np.std(data))     # 标准差 (总体) 1.4142...
print(np.var(data))     # 方差 2.0

注意:np.std / np.var 默认使用总体公式(ddof=0);样本标准差需传 ddof=1,例如 np.std(data, ddof=1)。

11.3 综合分析 + 直方图

import numpy as np
import matplotlib.pyplot as plt

data = np.random.normal(loc=0, scale=1, size=1000)
mean_v, med_v, std_v, var_v = np.mean(data), np.median(data), np.std(data), np.var(data)
print(mean_v, med_v, std_v, var_v)

plt.hist(data, bins=30, alpha=0.7, edgecolor='black')
plt.axvline(mean_v, color='red', linestyle='--', label='Mean')
plt.axvline(med_v, color='gold', linestyle='--', label='Median')
plt.legend(); plt.title("Normal Distribution Sample"); plt.show()
{
  "title": {"text": "正态分布样本统计"},
  "tooltip": {"trigger": "axis"},
  "xAxis": {"type": "category", "data": ["mean","median","std","var"]},
  "yAxis": {"type": "value"},
  "series": [
    {"type": "bar", "name": "value", "data": [0,0,1,1]}
  ]
}

(示意:真实值请运行代码后替换)

11.4 练习

data = np.array([10,20,30,40,50,60,70,80,90,100])
print({ "mean": np.mean(data),
        "median": np.median(data),
        "std": np.std(data),
        "var": np.var(data) })

12 排序与条件选择

12.1 排序

arr = np.array([3,1,2,5,4])
print(np.sort(arr))          # np.sort returns a sorted copy; arr itself is unchanged
# Take the argsort while arr is still unsorted: after an in-place sort it
# would only yield the identity permutation [0 1 2 3 4].
idx = np.argsort(arr)        # positions that would sort arr -> [1 2 0 4 3]
print(arr[idx])              # reorder through the index array -> [1 2 3 4 5]
arr.sort()                   # sorts in place
print(arr)

12.2 条件选择

arr = np.array([10,20,30,40,50])
cond = arr > 30
print(np.where(cond))        # (array([3,4]),)
print(arr[cond])             # [40 50]

nz = np.nonzero([0,1,2,0,3])
print(nz)                    # (array([1,2,4]),)

12.3 练习

arr = np.array([5,3,8,1,4])
print(np.sort(arr))
print(np.argsort(arr))
print(arr[arr>3])
print(np.nonzero(arr))

13 字符串函数

13.1 概述

np.char 命名空间提供向量化字符串操作:add, split, find, replace, lower, upper 等;适合批量处理而非逐元素循环。

13.2 主要示例

import numpy as np
a1 = np.array(['Hello','World'])
a2 = np.array([' NumPy',' Tutorial'])
print(np.char.add(a1,a2))              # 连接

sents = np.array(['Hello NumPy Tutorial','Welcome to Python'])
print(np.char.split(sents))            # 分割 -> 列表对象

tokens = np.array(['Hello','World','NumPy'])
print(np.char.find(tokens,'o'))        # 查找字符 'o' 的位置,未找到 -1

rep = np.array(['Hello World','NumPy is great'])
print(np.char.replace(rep,'World','NumPy'))

13.3 练习

strings = np.array(['apple','banana','cherry'])
extra = np.array([' and orange',' and grape',' and kiwi'])
print(np.char.add(strings, extra))
print(np.char.find(strings,'a'))      # 每个字符串首次出现 'a' 位置
print(np.char.upper(strings))

14 线性代数

14.1 基础操作

import numpy as np
A = np.array([[1,2],[3,4]])
B = np.array([[5,6],[7,8]])
print(A @ B)          # 矩阵乘法
print(A.T)            # 转置
print(np.linalg.det(A))
print(np.linalg.inv(A))   # 逆 (det != 0)

14.2 特征值与特征向量

M = np.array([[2,1],[1,2]])
vals, vecs = np.linalg.eig(M)
print(vals)           # 特征值
print(vecs)           # 列向量为对应特征向量

14.3 练习

A = np.array([[2,3],[5,4]])
B = np.array([[1,2],[3,2]])
print(A @ B)
print(A.T)
print(np.linalg.inv(A))
C = np.array([[3,1],[1,3]])
print(np.linalg.eig(C))

15 文件输入输出

15.1 读取

# data.txt 内容:以空格分隔数值
arr = np.loadtxt('data.txt')               # 简单规则文本
arr_nan = np.genfromtxt('data_with_nan.txt', filling_values=0)  # 缺失值处理

15.2 保存

data = np.array([[1,2,3],[4,5,6],[7,8,9]])
np.savetxt('out.csv', data, delimiter=',', fmt='%d')
np.save('data.npy', data)
loaded = np.load('data.npy')

15.3 练习

d = np.arange(9).reshape(3,3)
np.savetxt('m.txt', d, fmt='%d')
np.save('m.npy', d)
print(np.loadtxt('m.txt'))
print(np.load('m.npy'))

16 与其他库结合

16.1 NumPy ↔ Pandas

import numpy as np, pandas as pd
arr = np.array([[1,2,3],[4,5,6]])
df = pd.DataFrame(arr, columns=['A','B','C'])
print(df.to_numpy())            # 取底层 ndarray
print(np.mean(df, axis=0))      # 利用 NumPy 算列均值

16.2 NumPy ↔ Matplotlib

import numpy as np, matplotlib.pyplot as plt
x = np.linspace(0,10,200)
plt.plot(x, np.sin(x), label='sin'); plt.plot(x, np.cos(x), label='cos')
plt.legend(); plt.grid(); plt.show()
{
  "title": {"text": "sin / cos 示意"},
  "tooltip": {"trigger": "axis"},
  "xAxis": {"type": "category", "data": ["0","π/2","π","3π/2","2π"]},
  "yAxis": {"type": "value"},
  "legend": {"data": ["sin","cos"]},
  "series": [
    {"type": "line","name":"sin","data":[0,1,0,-1,0]},
    {"type": "line","name":"cos","data":[1,0,-1,0,1]}
  ]
}

16.3 NumPy ↔ SciPy

from scipy.linalg import inv
A = np.array([[1,2],[3,4]])
print(inv(A))                   # 更丰富线性代数工具

from scipy.optimize import minimize
def f(x):
    """Evaluate the quadratic x**2 + 3*x + 2 at ``x``."""
    squared_term = x**2
    linear_term = 3*x
    return squared_term + linear_term + 2
res = minimize(f, 0)
print(res.x)                   # 最优点

from scipy.interpolate import interp1d
x = np.array([1,2,3,4]); y = x**2
f = interp1d(x,y,kind='linear')
x_new = np.linspace(1,4,8)
print(f(x_new))

16.4 练习

import numpy as np, pandas as pd, matplotlib.pyplot as plt
from scipy import stats

df = pd.DataFrame(np.array([[1,2,3],[4,5,6]]), columns=['A','B','C'])
print(df)
x = np.linspace(0,10,100); plt.plot(x,np.cos(x)); plt.show()

# 正态分布拟合
samples = np.random.normal(0,1,1000)
count,bins,_ = plt.hist(samples,30,density=True)
pdf = stats.norm.pdf(bins,0,1)
plt.plot(bins,pdf,'r-'); plt.show()

17 NumPy的性能优化

17.1 性能技巧概述

  • 使用矢量化 (避免 Python for 循环):底层 C/Fortran 实现。
  • 利用广播:无需显式扩展数组。
  • 合理 dtype:精度足够时用 float32 减少内存与缓存压力。
  • 内置函数 (np.sum, np.mean, np.dot) 优于手写循环。
  • 避免频繁创建临时对象:链式操作可分步减少峰值内存。
  • 使用 inplace 替换(可行时):如 a += b
  • 大规模随机需设种子 np.random.seed() 保复现。
  • 使用 np.memmap 将超大数据映射到磁盘文件,按需读取而不必一次载入内存。

17.2 示例对比

import numpy as np, time

N = 1_000_000
a = np.random.rand(N); b = np.random.rand(N)

# 矢量化
t0 = time.time()
c = a + b
print("vectorized:", time.time() - t0)

# 循环
t0 = time.time()
res = [a[i] + b[i] for i in range(N)]
print("python loop:", time.time() - t0)

# dtype 控制
x64 = np.random.rand(N).astype(np.float64)
x32 = np.random.rand(N).astype(np.float32)
print(x64.nbytes, x32.nbytes)  # 内存差距

17.3 广播 & 内置

v = np.arange(5)
print(v + 10)          # 广播标量
M = np.arange(12).reshape(3,4)
row_mean = M.mean(axis=1)
col_sum  = M.sum(axis=0)

17.4 练习

a = np.random.rand(1_000_000)
b = np.random.rand(1_000_000)
c = a + b
mean_val = np.mean(a)
print(mean_val, c[:3])
{
  "title": {"text": "性能示意(相对耗时)"},
  "tooltip": {},
  "xAxis": {"type": "category", "data": ["vectorized","loop"]},
  "yAxis": {"type": "value"},
  "series": [
    {"type": "bar","name":"time_ratio","data":[1,30]}
  ]
}

18 实际案例分析

18.1 数据预处理与清洗

关键步骤:缺失值识别→填充→去重→类型转换→统计。

import numpy as np, pandas as pd

data = {
    "Product": ["A","B","C","A","B","C","A",None,"B"],
    "Sales":   [250,150,np.nan,300,200,np.nan,350,400,450],
    "Profit":  [50,30,10,70,20,5,np.nan,80,90],
    "Quantity":[10,5,2,15,7,np.nan,20,25,30]
}
df = pd.DataFrame(data)
miss = df.isnull().sum()
df["Sales"]    = df["Sales"].fillna(df["Sales"].mean())
df["Profit"]   = df["Profit"].fillna(df["Profit"].mean())
df["Quantity"] = df["Quantity"].fillna(df["Quantity"].median())
df["Product"]  = df["Product"].fillna("Unknown")
df = df.drop_duplicates()
df["Sales"] = df["Sales"].astype(float)
df["Profit"] = df["Profit"].astype(float)
df["Quantity"] = df["Quantity"].astype(int)

18.2 可视化替换

销售额与利润散点、产品总销售、数量分布箱形。

{
  "title":{"text":"Sales vs Profit (by Product)"},
  "tooltip":{"trigger":"item"},
  "legend":{},
  "xAxis":{"type":"value","name":"Sales"},
  "yAxis":{"type":"value","name":"Profit"},
  "series":[
    {
      "type":"scatter",
      "name":"points",
      "data":[[250,50],[150,30],[300,70],[200,20],[350,44.375],[400,80],[450,90]]
    }
  ]
}
{
  "title": {"text": "Total Sales by Product"},
  "tooltip": {"trigger": "axis"},
  "xAxis": {
    "type": "category",
    "data": ["A", "B", "C", "Unknown"]
  },
  "yAxis": {"type": "value"},
  "series": [
    {
      "name": "Total Sales",
      "type": "bar",
      "data": [900, 800, 600, 400],
      "itemStyle": {
        "color": "#5470C6"
      }
    }
  ]
}
{
  "title": {"text": "Quantity Distribution (Boxplot)"},
  "tooltip": {"trigger": "item"},
  "xAxis": {
    "type": "category",
    "data": ["A", "B", "C", "Unknown"]
  },
  "yAxis": {"type": "value", "name": "Quantity"},
  "series": [
    {
      "name": "Quantity",
      "type": "boxplot",
      "data": [
        [10, 10, 15, 20, 20],
        [5, 5, 7, 30, 30],
        [2, 2, 7, 12, 12],
        [25, 25, 25, 25, 25]
      ]
    }
  ]
}

18.3 NumPy统计分析

sales    = df["Sales"].to_numpy()
profit   = df["Profit"].to_numpy()
quantity = df["Quantity"].to_numpy()

mean_sales = sales.mean()
std_sales  = sales.std()
profit_margin = profit / sales
corr = np.corrcoef(sales, profit)[0,1]
total_sales  = sales.sum()
total_profit = profit.sum()
{
  "title":{"text":"Profit Margin Distribution"},
  "tooltip":{"trigger":"axis"},
  "xAxis":{"type":"category","data":["A1","B1","C1","A2","B2","C2","A3","Unknown","B3"]},
  "yAxis":{"type":"value"},
  "series":[
    {"type":"bar",
     "data":[0.2,0.2,0.033,0.233,0.1,0.017,0.127,0.2,0.2]
    }
  ]
}

18.4 完整清洗 + 分析核心

import numpy as np, pandas as pd
df = pd.DataFrame(data)
df = (df
      .assign(Sales=lambda d: d["Sales"].fillna(d["Sales"].mean()),
              Profit=lambda d: d["Profit"].fillna(d["Profit"].mean()),
              Quantity=lambda d: d["Quantity"].fillna(d["Quantity"].median()),
              Product=lambda d: d["Product"].fillna("Unknown"))
      .drop_duplicates())
sales, profit = df["Sales"].to_numpy(), df["Profit"].to_numpy()
pm = profit / sales
corr = np.corrcoef(sales, profit)[0,1]

19 练习题

19.1 K-Means 聚类 (简实现)

import numpy as np
np.random.seed(0)
X = np.random.rand(300,2)

def _nearest_centroid(X, centroids):
    """Return, for every row of ``X``, the index of its closest centroid."""
    # X[:, None] has shape (n, 1, d); broadcasting against (k, d) gives (n, k, d)
    dist = np.linalg.norm(X[:, None] - centroids, axis=2)   # shape (n, k)
    return dist.argmin(axis=1)


def kmeans(X, k, max_iters=100):
    """Cluster the rows of ``X`` into ``k`` groups with Lloyd's algorithm.

    Args:
        X: array-like of shape (n_samples, n_features).
        k: number of clusters. The initial centroids are ``k`` distinct rows
            of ``X`` drawn through the global ``np.random`` state, so seed
            it beforehand for reproducible results.
        max_iters: upper bound on the number of centroid updates.

    Returns:
        A tuple ``(labels, centroids)``. ``labels`` has shape (n_samples,)
        and holds the index of the nearest returned centroid for each row;
        ``centroids`` is a float array of shape (k, n_features).

    Raises:
        ValueError: raised by ``np.random.choice`` when ``k`` exceeds the
            number of rows in ``X``.
    """
    X = np.asarray(X)
    # Float copy so cluster means can be written back without truncation.
    centroids = X[np.random.choice(len(X), k, replace=False)].astype(float)
    for _ in range(max_iters):
        labels = _nearest_centroid(X, centroids)
        new_centroids = centroids.copy()
        for i in range(k):
            members = X[labels == i]
            # An empty cluster keeps its previous centroid; taking the mean
            # of zero rows would yield NaN and poison every later distance.
            if len(members):
                new_centroids[i] = members.mean(axis=0)
        converged = np.allclose(centroids, new_centroids, atol=1e-6)
        centroids = new_centroids
        if converged:
            break
    # Assign once more against the centroids actually returned. This keeps
    # the two results consistent when the loop runs out of iterations and
    # also defines ``labels`` when max_iters is 0.
    labels = _nearest_centroid(X, centroids)
    return labels, centroids

labels, centers = kmeans(X, 3)
{
  "title":{"text":"K-Means 聚类示意"},
  "xAxis":{"type":"value"},
  "yAxis":{"type":"value"},
  "series":[
    {"type":"scatter","name":"cluster0","data":[[0.1,0.2],[0.15,0.25]]},
    {"type":"scatter","name":"cluster1","data":[[0.7,0.8],[0.75,0.77]]},
    {"type":"scatter","name":"cluster2","data":[[0.4,0.5],[0.45,0.55]]},
    {"type":"scatter","name":"centroids","data":[[0.13,0.23],[0.72,0.78],[0.43,0.53]],
     "symbolSize":14,"itemStyle":{"color":"red"}}
  ]
}

19.2 PCA 降维

# PCA: project 100 random 5-D samples onto their two leading principal components.
data = np.random.rand(100,5)
centered = data - data.mean(axis=0)          # remove the per-column mean
cov = np.cov(centered, rowvar=False)         # columns are the variables -> 5x5 matrix
# A covariance matrix is symmetric, so eigh is the matching solver: it returns
# real eigenvalues in ascending order together with orthonormal eigenvectors.
eigvals, eigvecs = np.linalg.eigh(cov)
# Reverse the order so column 0 is the largest-variance direction (PC1) and
# column 1 the second largest (PC2).
top = eigvecs[:, eigvals.argsort()[::-1][:2]]
reduced = centered @ top
{
  "title":{"text":"PCA 前两主成分散点"},
  "xAxis":{"type":"value","name":"PC1"},
  "yAxis":{"type":"value","name":"PC2"},
  "series":[
    {"type":"scatter","data":[[0.1,0.05],[0.2,-0.03],[0.15,0.08]]}
  ]
}

19.3 时间序列移动平均

import pandas as pd, numpy as np
dates = pd.date_range("2021-01-01", periods=100)
values = np.random.rand(100) * 100
ts = pd.Series(values, index=dates)
ma7 = ts.rolling(7).mean()
{
  "title":{"text":"7日移动平均示意"},
  "tooltip":{"trigger":"axis"},
  "legend":{"data":["raw","ma7"]},
  "xAxis":{"type":"category","data":["d1","d20","d40","d60","d80","d100"]},
  "yAxis":{"type":"value"},
  "series":[
    {"type":"line","name":"raw","data":[50,60,55,65,58,62]},
    {"type":"line","name":"ma7","data":[52,57,56,60,59,61]}
  ]
}

19.4 标准化与归一化

from sklearn.preprocessing import StandardScaler, MinMaxScaler
data = np.random.rand(10,2) * 100
std = StandardScaler().fit_transform(data)
norm = MinMaxScaler().fit_transform(data)

19.5 t 检验

from scipy import stats
s1 = np.random.normal(50,10,100)
s2 = np.random.normal(55,10,100)
t_stat, p_val = stats.ttest_ind(s1, s2)

19.6 自定义函数作用于数组

def custom_operation(x):
    """Return sin(x) + log(1 + x) for a scalar or NumPy array ``x``."""
    sine_part = np.sin(x)
    # log1p(x) evaluates log(1 + x) and stays accurate when x is close to 0
    log_part = np.log1p(x)
    return sine_part + log_part
x = np.linspace(0,10,100)
y = custom_operation(x)
{
  "title":{"text":"f(x)=sin(x)+log(1+x)"},
  "xAxis":{"type":"category","data":["0","2","4","6","8","10"]},
  "yAxis":{"type":"value"},
  "series":[{"type":"line","data":[0,1.46,1.03,0.42,0.97,1.30]}]
}

19.7 多维数组列归一化

def normalize(col):
    """Min-max scale the 1-D array ``col`` so its minimum maps to 0."""
    lowest = col.min()
    # the tiny constant keeps the denominator non-zero when all entries are equal
    spread = col.max() - lowest + 1e-12
    return (col - lowest) / spread
data = np.random.rand(5,3) * 100
norm_cols = np.apply_along_axis(normalize, 0, data)

19.8 多项式拟合

x = np.linspace(0,10,30)
y = 2*x + np.random.randn(30)*2
coef = np.polyfit(x,y,1)
poly = np.poly1d(coef)
y_fit = poly(x)
{
  "title":{"text":"线性拟合示意"},
  "xAxis":{"type":"value"},
  "yAxis":{"type":"value"},
  "series":[
    {"type":"scatter","data":[[0,0],[5,11],[10,19]]},
    {"type":"line","data":[[0,1],[10,20]]}
  ]
}

19.9 DBSCAN 聚类 (调用库版)

from sklearn.datasets import make_moons
from sklearn.cluster import DBSCAN
X, _ = make_moons(n_samples=300, noise=0.1)
labels = DBSCAN(eps=0.2, min_samples=5).fit_predict(X)
{
  "title":{"text":"DBSCAN 聚类示意"},
  "xAxis":{"type":"value"},
  "yAxis":{"type":"value"},
  "series":[
    {"type":"scatter","name":"clusterA","data":[[0.1,0.2],[0.2,0.25]]},
    {"type":"scatter","name":"clusterB","data":[[1.0,0.3],[0.9,0.35]]}
  ]
}

19.10 ARIMA 时间序列预测 (简示)

import numpy as np, pandas as pd
from statsmodels.tsa.arima.model import ARIMA
series = np.random.randn(100).cumsum()
dates = pd.date_range("2021-01-01", periods=100)
ts = pd.Series(series, index=dates)
model = ARIMA(ts, order=(1,1,1)).fit()
forecast = model.forecast(steps=10)
{
  "title":{"text":"ARIMA 预测示意"},
  "xAxis":{"type":"category","data":["t90","t92","t94","t96","t98","t100","t101","t102","t103","t104","t105"]},
  "yAxis":{"type":"value"},
  "legend":{"data":["actual","forecast"]},
  "series":[
    {"type":"line","name":"actual","data":[90,91,92,93,94,95]},
    {"type":"line","name":"forecast","data":[95.2,95.4,95.6,95.7,95.8]}
  ]
}