노현욱

Code for displaying mel spectrograms

......@@ -6,17 +6,21 @@ import time
import librosa
import tkinter as tk
# NOTE(review): this span reads like a flattened diff -- several settings are
# assigned twice in a row (apparently the old value followed by the new one).
# If executed exactly as written, the later assignment of each name wins.
# Source clip: the MP3 is decoded by pydub and its raw bytes are viewed as
# 16-bit integer samples.
ORIGIN_SOUND = np.frombuffer(pydub.AudioSegment.from_mp3("./sounds/ping.mp3").raw_data, dtype=np.int16)
ORIGIN_SOUND = np.frombuffer(pydub.AudioSegment.from_mp3("./sounds/s1.mp3").raw_data, dtype=np.int16)
# Parameter settings
RATE = 44100 # sampling frequency
CHUNK = 1024 # number of samples to read
THRESHOLD = 256 # threshold value used for peak detection
THRESHOLD = 128 # threshold value used for peak detection
WIN_SIZE = 1024 # window size used for the STFT
HOP_SIZE = 512 # distance between STFT windows (overlap size)
DELAY = 0.1 # Delay time in seconds
MAX_FREQ = 3000 # max freq for pitch shifting
MAX_HEIGHT = 2000 # max height for pitch shifting
MAX_FREQ = 10000 # max freq for pitch shifting
MAX_HEIGHT = 10000 # max height for pitch shifting
MAX_DECIBEL = 50 # max decibel for decibel shifting
SOURCE_MODE = "decibel" # height, decibel or frequency
MODE = "low_filter" # low_filter, echo or pitch_shift
SOUND_SIZE = len(ORIGIN_SOUND) # length of the sound clip
# presumably the current playback offset into the sound buffer -- confirm
sound_idx = 0
......@@ -56,10 +60,8 @@ def get_user_audio(duration):
return sound
ORIGIN_SOUND = get_user_audio(0.5)
SOURCE_MODE = "decibel" # decibel or frequency
MODE = "high_filter" # echo or pitch_shift
SOUND_SIZE = len(ORIGIN_SOUND) # 음원 길이
if "y" == input("직접 녹음을 하시겠습니까? (y/n) : "):
ORIGIN_SOUND = get_user_audio(0.2)
sound = ORIGIN_SOUND.copy()
......@@ -67,24 +69,27 @@ print(type(sound), len(sound))
p = pyaudio.PyAudio()
last_frame = 0
last_time = 0
# 콜백 함수 정의
def process_audio(in_data, frame_count, time_info, status):
global buffer
global sound
global sound_idx
global last_frame
global last_time
def get_distortion(height, frequency):
height = min(height, MAX_HEIGHT) / MAX_HEIGHT
def get_distortion(height, frequency, decibel):
height = min(height, MAX_HEIGHT) / RATE
frequency = min(frequency, MAX_FREQ) / MAX_FREQ
decibel = min(decibel, MAX_DECIBEL) / MAX_DECIBEL
if SOURCE_MODE == "decibel":
if SOURCE_MODE == "height":
param = height
elif SOURCE_MODE == "frequency":
param = frequency
elif SOURCE_MODE == "decibel":
param = decibel
else:
return ORIGIN_SOUND
......@@ -110,16 +115,17 @@ def process_audio(in_data, frame_count, time_info, status):
return echoed_samples
def shift_pitch(frequency):
pitch_shift_factor = frequency
pitch_shift_factor = frequency * 3
audio_array = ORIGIN_SOUND.copy()
# Resample the audio array to change the pitch
resampled_array = librosa.effects.pitch_shift(np.array(audio_array, dtype=np.float32), sr=RATE, n_steps=pitch_shift_factor, bins_per_octave=1)
return np.array(resampled_array, dtype=np.int16)
def low_filter(param):
audio_data = data
audio_data = np.array(ORIGIN_SOUND.copy(), dtype=np.float32)
# Define the filter parameters
cutoff_freq = param * MAX_FREQ # Frequency cutoff for the low-pass filter (in Hz)
cutoff_freq = param * RATE # Frequency cutoff for the low-pass filter (in Hz)
print("cut of below : ", cutoff_freq)
nyquist_freq = 0.5 * RATE # Nyquist frequency (half of the sampling rate)
normalized_cutoff = cutoff_freq / nyquist_freq # Normalized cutoff frequency
......@@ -129,7 +135,7 @@ def process_audio(in_data, frame_count, time_info, status):
# Apply the low-pass filter to the audio data
filtered_audio = signal.lfilter(b, a, audio_data)
return filtered_audio
return np.array(filtered_audio, dtype=np.int16)
# 오디오 데이터 변환
data = np.frombuffer(in_data, dtype=np.int16)
......@@ -140,18 +146,19 @@ def process_audio(in_data, frame_count, time_info, status):
# 피크 검출
peaks, _ = signal.find_peaks(np.abs(np.mean(Zxx, axis=1)), height=THRESHOLD, distance=WIN_SIZE)
# 파라미터 추정
if len(peaks) > 0 and last_frame+1 != frame_count:
last_frame = frame_count
if len(peaks) > 0 and last_time+0.1 < time_info['current_time']:
peak_idx = peaks[0] # 첫 번째 피크 선택
height = np.abs(Zxx[peak_idx, 0]) # 피크의 높이 추정
freq = f[peak_idx] # 피크의 주파수 추정
amp = np.max(np.abs(data)) # 신호의 진폭 추정
decibel = np.mean(librosa.amplitude_to_db(np.abs(Zxx))) # 진폭을 데시벨로 변환
if(decibel > 20):
print("Height: {:.2f}, 주파수: {:.2f}, Amplitude: {:.2f}, Decibel: {:.2f}".format(height, freq, amp, decibel))
new_sound = get_distortion(height, freq)
if(decibel > 10) and height > 100:
last_time = time_info['current_time']
print("Height: {:.2f}, 주파수: {:.2f}, Amplitude: {:.2f}, Decibel: {:.2f}, time_info {:.2f}".format(height, freq, amp, decibel, time_info['current_time']))
new_sound = get_distortion(height, freq, decibel)
if(sound_idx > len(sound)):
sound = new_sound
sound_idx = 0
else:
mixed_end = min(len(sound), sound_idx + len(new_sound))
......@@ -161,7 +168,7 @@ def process_audio(in_data, frame_count, time_info, status):
result = np.concatenate((sound, new_sound[mixed_end-sound_idx:]),axis=0)
sound = result
elif len(peaks) > 0:
last_frame = frame_count
last_time = time_info['current_time']
sound_idx += 1024
if sound_idx > len(sound):
......
......@@ -5,6 +5,7 @@ import pydub
import time
import librosa
import tkinter as tk
import threading
ORIGIN_SOUND = np.frombuffer(pydub.AudioSegment.from_mp3("./sounds/ping.mp3").raw_data, dtype=np.int16)
......@@ -25,29 +26,26 @@ window.title("Sound Effect")
# Main window geometry string: 640x400 with +100+100 offsets; resizing is
# disabled in both directions.
window.geometry("640x400+100+100")
window.resizable(False, False)
# NOTE(review): the lines below read like a flattened diff -- info_text and
# info_label are each created twice. If executed exactly as written, the
# later definitions replace the earlier ones.
# A tk.Button is used as the status display; its caption follows info_text
# through the textvariable option.
info_text = tk.StringVar()
info_text.set("welcome! please press record button.")
info_label = tk.Button(window, textvariable=info_text, foreground="black", background="white")
info_label = tk.Button(window, width=50, height=10, bg="white", fg="black")
info_label.pack()
info_text = tk.StringVar()
info_text.set("ready for recording...")
info_label.config(textvariable=info_text)
def set_source_mode(mode):
    """Store *mode* in the module-level ``SOURCE_MODE`` setting.

    The value is assigned as-is; no validation is performed here. The
    "Frequency" button in this file passes the string "frequency".
    """
    global SOURCE_MODE
    SOURCE_MODE = mode
# 사용자의 목소리를 duration 초간 녹음.
def get_user_audio(duration):
global info_text, info_label
global info_text, info_label, ORIGIN_SOUND
frames = []
p = pyaudio.PyAudio()
# 카운터 시작
info_text.set("ready for recording...")
for _ in range(3, 0, -1):
info_text.set(str(_))
time.sleep(1)
info_text.set("start...")
# 실제 녹음 콜백 함수
def add_to_frame(in_data, frame_count, time_info, status):
......@@ -69,17 +67,19 @@ def get_user_audio(duration):
stream.close()
p.terminate()
info_text.set("start...")
return sound
def record():
global ORIGIN_SOUND
global SOURCE_MODE
global ORIGIN_SOUND, SOURCE_MODE, MODE
ORIGIN_SOUND = get_user_audio(0.5)
SOURCE_MODE = "decibel" # decibel or frequency
SOURCE_MODE = "frequency" # decibel or frequency
MODE = "pitch_shift"
def start():
global MODE, SOUND_SIZE, sound_idx, sound
MODE = "high_filter" # echo or pitch_shift
global MODE, SOUND_SIZE, sound_idx, sound, ORIGIN_SOUND, last_frame, SOURCE_MODE, MODE
SOUND_SIZE = len(ORIGIN_SOUND) # 음원 길이
sound = ORIGIN_SOUND.copy()
......@@ -92,8 +92,7 @@ def start():
# 콜백 함수 정의
def process_audio(in_data, frame_count, time_info, status):
global sound_idx, sound, last_frame, ORIGIN_SOUND, MODE, SOURCE_MODE
def get_distortion(height, frequency):
height = min(height, MAX_HEIGHT) / MAX_HEIGHT
frequency = min(frequency, MAX_FREQ) / MAX_FREQ
......@@ -216,6 +215,9 @@ def start():
p.terminate()
record_button = tk.Button(window, text="Record", width=10, height=2, command=lambda: record())
record_button.pack()
......@@ -225,6 +227,7 @@ decibel_button.pack()
frequency_button = tk.Button(window, text="Frequency", width=10, height=2, command = lambda: set_source_mode("frequency"))
frequency_button.pack()
#start_button = tk.Button(window, text="Start", width=10, height=2, command=lambda: threading.Thread(target=start).start())
start_button = tk.Button(window, text="Start", width=10, height=2, command=lambda: start())
start_button.pack()
......
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
# Demo script: add a single echo to a clip and compare mel spectrograms of
# the dry and echoed signals side by side.


def _apply_echo(samples, delay_samples, decay):
    """Return *samples* mixed with one delayed, attenuated copy of itself.

    Computes ``y[n] = x[n] + decay * x[n - delay_samples]``. The first
    ``delay_samples`` values are passed through unchanged because no delayed
    copy exists yet. If the delay is not positive or is at least as long as
    the signal, the result is simply a copy of the input.
    """
    echoed = samples.copy()
    if 0 < delay_samples < len(samples):
        # The dry signal stays at full level; the *delayed* copy is the one
        # that is attenuated. (The reverse produces a pre-echo and silences
        # the start of the clip.)
        echoed[delay_samples:] = (
            samples[delay_samples:] + decay * samples[:-delay_samples]
        )
    return echoed


# Load the audio file at its native sampling rate (sr=None disables resampling)
audio_file = "./sounds/s4.mp3"
audio, sr = librosa.load(audio_file, sr=None)

# Echo parameters
delay = int(0.2 * sr)  # Echo delay in samples (0.2 seconds)
decay1 = 0.8  # Echo decay factor 1
decay2 = 0.3  # Echo decay factor 2

# Apply the echo effect once per decay factor
echoed_audio1 = _apply_echo(audio, delay, decay1)
echoed_audio2 = _apply_echo(audio, delay, decay2)

# Compute the Mel spectrograms of the original and the two echoed signals
mel_spec_orig = librosa.feature.melspectrogram(y=audio, sr=sr)
mel_spec_echoed1 = librosa.feature.melspectrogram(y=echoed_audio1, sr=sr)
mel_spec_echoed2 = librosa.feature.melspectrogram(y=echoed_audio2, sr=sr)

# Convert to dB scale, each relative to its own maximum
mel_spec_orig_db = librosa.power_to_db(mel_spec_orig, ref=np.max)
mel_spec_echoed1_db = librosa.power_to_db(S=mel_spec_echoed1, ref=np.max)
mel_spec_echoed2_db = librosa.power_to_db(S=mel_spec_echoed2, ref=np.max)

# Display the Mel spectrograms (original on the left, decay-1 echo on the right)
plt.figure(figsize=(10, 4))

plt.subplot(1, 2, 1)
librosa.display.specshow(mel_spec_orig_db, sr=sr, x_axis='time', y_axis='mel')
plt.colorbar(format='%+2.0f dB')
plt.title('Original Mel Spectrogram')

plt.subplot(1, 2, 2)
librosa.display.specshow(mel_spec_echoed1_db, sr=sr, x_axis='time', y_axis='mel')
plt.colorbar(format='%+2.0f dB')
plt.title('Echoed Mel Spectrogram (Decay 1)')

# NOTE: the decay-2 panel is currently disabled; mel_spec_echoed2_db is
# computed above but not drawn. Enabling it needs a 1x3 subplot layout.
# plt.subplot(1, 3, 3)
# librosa.display.specshow(mel_spec_echoed2_db, sr=sr, x_axis='time', y_axis='mel')
# plt.colorbar(format='%+2.0f dB')
# plt.title('Echoed Mel Spectrogram (Decay 2)')

plt.tight_layout()
plt.show()
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
from scipy.signal import butter, lfilter
import sounddevice as sd
def apply_low_pass_filter(data, cutoff_freq, sample_rate, order=5):
    """Low-pass filter *data* with a digital Butterworth filter.

    Args:
        data: Array-like of samples (anything ``np.asarray`` accepts).
            Filtering runs along the last axis.
        cutoff_freq: Cutoff frequency in Hz. Must lie strictly between 0
            and the Nyquist frequency (``sample_rate / 2``).
        sample_rate: Sampling rate of *data* in Hz. Must be positive.
        order: Order of the Butterworth filter.

    Returns:
        The filtered samples as a NumPy array with the same shape as *data*.

    Raises:
        ValueError: If *sample_rate* is not positive, or *cutoff_freq* is
            not inside the open interval (0, Nyquist).
    """
    # Accept plain sequences as well as ndarrays.
    samples = np.asarray(data)

    if sample_rate <= 0:
        raise ValueError(f"sample_rate must be positive, got {sample_rate!r}")

    # Normalise the cutoff to the Nyquist frequency, as butter() expects
    # for digital filters.
    nyquist_freq = 0.5 * sample_rate
    if not 0 < cutoff_freq < nyquist_freq:
        raise ValueError(
            f"cutoff_freq must be between 0 and the Nyquist frequency "
            f"({nyquist_freq} Hz), got {cutoff_freq!r}"
        )
    digital_cutoff = cutoff_freq / nyquist_freq

    # Create the filter coefficients using a Butterworth design
    b, a = butter(order, digital_cutoff, btype='low', analog=False)

    # Apply the filter to the data
    return lfilter(b, a, samples)
# Demo script: low-pass filter a clip, play the result, and plot waveforms
# and mel spectrograms of the original and filtered signals in a 2x2 grid.

# Input clip, loaded at its native sampling rate (sr=None disables resampling)
audio_file = "./sounds/s4.mp3"
audio, sr = librosa.load(audio_file, sr=None)

# Filter settings
cutoff_frequency = 5000  # Cutoff frequency in Hz
sample_rate = sr

# Filter the clip and start playback of the filtered version
filtered_audio = apply_low_pass_filter(audio, cutoff_frequency, sample_rate)
sd.play(filtered_audio, sample_rate)

# Mel spectrograms (dB, each relative to its own maximum) before and after
mel_spec_orig = librosa.feature.melspectrogram(y=audio, sr=sr)
mel_spec_orig_db = librosa.power_to_db(S=mel_spec_orig, ref=np.max)
mel_spec_filtered = librosa.feature.melspectrogram(y=filtered_audio, sr=sr)
mel_spec_filtered_db = librosa.power_to_db(S=mel_spec_filtered, ref=np.max)

plt.figure(figsize=(12, 8))

# Top row: time-domain waveforms
waveform_panels = (
    (audio, 'Original Signal'),
    (filtered_audio, 'Filtered Signal'),
)
for slot, (waveform, heading) in enumerate(waveform_panels, start=1):
    plt.subplot(2, 2, slot)
    plt.plot(waveform)
    plt.title(heading)
    plt.xlabel('Time')
    plt.ylabel('Amplitude')

# Bottom row: mel spectrograms with a dB colorbar each
spectrogram_panels = (
    (mel_spec_orig_db, 'Original Mel Spectrogram'),
    (mel_spec_filtered_db, 'Filtered Mel Spectrogram'),
)
for slot, (spec_db, heading) in enumerate(spectrogram_panels, start=3):
    plt.subplot(2, 2, slot)
    librosa.display.specshow(spec_db, sr=sr, x_axis='time', y_axis='mel')
    plt.colorbar(format='%+2.0f dB')
    plt.title(heading)

plt.tight_layout()
plt.show()
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
# Demo script: pitch-shift a clip and show mel spectrograms of the original
# and shifted signals side by side.

# Input clip, loaded at its native sampling rate (sr=None disables resampling)
audio_file = "./sounds/s4.mp3"
audio, sr = librosa.load(audio_file, sr=None)

# Mel spectrogram of the untouched clip
mel_spec_orig = librosa.feature.melspectrogram(y=audio, sr=sr)

# Pitch shift: n_steps is counted in units of 1/bins_per_octave of an octave,
# so with bins_per_octave=1 a factor of 2.0 requests a two-octave shift.
pitch_shift_factor = 2.0
audio_pitch_shifted = librosa.effects.pitch_shift(
    y=audio,
    sr=sr,
    n_steps=pitch_shift_factor,
    bins_per_octave=1,
)
mel_spec_pitch_shifted = librosa.feature.melspectrogram(y=audio_pitch_shifted, sr=sr)

# One row, two columns: original on the left, shifted on the right.
# No colorbar is drawn for these panels.
plt.figure(figsize=(10, 4))
spectrogram_panels = (
    (mel_spec_orig, 'Original Mel Spectrogram'),
    (mel_spec_pitch_shifted, 'Pitch-Shifted Mel Spectrogram'),
)
for column, (power_spec, heading) in enumerate(spectrogram_panels, start=1):
    plt.subplot(1, 2, column)
    librosa.display.specshow(
        librosa.power_to_db(power_spec, ref=np.max),
        sr=sr,
        x_axis='time',
        y_axis='mel',
    )
    plt.title(heading)

plt.tight_layout()
plt.show()