$$ STFT_x(t_0, u) = \int_{-\infin}^{+\infin} x(t) \cdot W(t-t_0) \cdot {\rm e}^{-{\rm j} 2 \pi u t} {\rm d} t $$
下面是对每个采样点处都做了插值的方案。
def stfs_amp_sample_res(frames, profile, sr, frame_length, hop_length):
freqs = np.fft.rfftfreq(frame_length, d=1./sr)
freq_num = freqs.shape[0] # should be frame_length // 2 + 1
X = np.empty(shape=(len(frames), freq_num), dtype=np.float64)
for m, frame in tqdm(list(enumerate(frames)), ascii=True):
X[m] = np.abs(np.fft.rfft(frame))
samples_num = (len(frames) - 1) * hop_length + frame_length
times = np.linspace(0, samples_num / sr, samples_num, endpoint=False)
img_amp = np.zeros(shape=(samples_num, freq_num), dtype=np.float64)
hamming_window = np.hamming(frame_length)
for m in tqdm(list(range(len(frames))), ascii=True):
begin_sample = hop_length * m
oend_sample = begin_sample + frame_length
img_amp[begin_sample:oend_sample, :] += X[m, :][np.newaxis, :] * hamming_window[:, np.newaxis]
# if standardation needed
for n in tqdm(list(range(samples_num)), ascii=True):
img_amp[n] /= profile[n]['hamming'].sum().item()
return img_amp, freqs, times
下面是展示幅值图像的方法。
def db_to_amp(db):
return 10.0 ** (db / 20.0)
def log_mapping(x, ref_db=-60, max_db=0):
'''
Args:
x: number or array-like, pre-normalization required
max_db: default 0, if db_of_any_x > max_db throws warning message
'''
x = np.array(x)
if x.max() > db_to_amp(max_db):
print('Warning: db_of_any_x > max_db at log_mapping()', file=sys.stderr)
y = 20.0 * np.log10(np.maximum(db_to_amp(ref_db), x))
# 确保所有值都在 0-1 范围内
return np.clip((y - ref_db) / -ref_db, 0, 1)
def print_img_amp(img_amp, times, freqs, figsize=(12, 6), min_db=60, db_tick=10, freq_tick=10, freq_max=None, cm=plt.cm.hot):
img = img_amp.T
img /= img.max()
img = log_mapping(img, ref_db=-min_db)
img = (img * 255).astype(np.uint8)
# import PIL
# PIL.Image.fromarray(img[:, ::48]).save('temp.png')
if freq_max is None:
freq_max = img.shape[0]
fig, ax = plt.subplots(figsize=figsize)
pc = ax.imshow(img[:freq_max, ::48], cm, aspect='auto')
# 设置xticks和yticks
xticks = np.arange(0, times[::48].shape[0], 100, dtype=int)
ax.set_xticks(xticks, ["%.2f" % it for it in times[::48][xticks]])
ax.set_xlabel("time (s)")
yticks = np.arange(0, freqs[:freq_max].shape[0], freq_tick, dtype=int)
ax.set_yticks(yticks, freqs[:freq_max][yticks].round().astype(int).astype(str))
ax.set_ylabel("freq (Hz)")
ax.invert_yaxis()
tick_db = np.linspace(0, -min_db, int(round(min_db/db_tick)) + 1)
tick_vals = (tick_db + min_db) / min_db * 255
tick_labels = ["%d dB" % round(it) for it in tick_db]
cbar = fig.colorbar(pc, ax=ax, ticks=tick_vals)
cbar.ax.set_yticklabels(tick_labels)
ax.set_title('$|X[m, k]|$ (in dB)')
fig.tight_layout()
图中的竖向线貌似不是由于窗函数或者增长度跳距问题引起的。
下面是通过公式直接计算、而不是经过 FFT 计算的结果。基本看不出差别。
X_comp = np.empty(shape=(len(frames), freq_num), dtype=np.complex128)
for m, frame in tqdm(list(enumerate(frames)), ascii=True):
w_index = -1j * (2 * np.pi / len(frame))
for k in range(0, freq_num):
ns = np.arange(len(frame))
X_comp[m, k] = (frame * np.exp((w_index * k) * ns)).sum()
X_amp = np.abs(X_comp)
此图是错误地在插值中想要引入相位变化的后果。大部分谱线都消失了。