노현욱

Mel-spectrogram visualization code

...@@ -6,17 +6,21 @@ import time ...@@ -6,17 +6,21 @@ import time
6 import librosa 6 import librosa
7 import tkinter as tk 7 import tkinter as tk
8 8
9 -ORIGIN_SOUND = np.frombuffer(pydub.AudioSegment.from_mp3("./sounds/ping.mp3").raw_data, dtype=np.int16) 9 +ORIGIN_SOUND = np.frombuffer(pydub.AudioSegment.from_mp3("./sounds/s1.mp3").raw_data, dtype=np.int16)
10 10
11 # 파라미터 설정 11 # 파라미터 설정
12 RATE = 44100 # 샘플링 주파수 12 RATE = 44100 # 샘플링 주파수
13 CHUNK = 1024 # 읽을 샘플의 수 13 CHUNK = 1024 # 읽을 샘플의 수
14 -THRESHOLD = 256 # 피크를 검출하기 위한 threshold 값 14 +THRESHOLD = 128 # 피크를 검출하기 위한 threshold 값
15 WIN_SIZE = 1024 # STFT를 적용할 윈도우 사이즈 15 WIN_SIZE = 1024 # STFT를 적용할 윈도우 사이즈
16 HOP_SIZE = 512 # STFT에서 윈도우 사이의 거리 (오버랩 사이즈) 16 HOP_SIZE = 512 # STFT에서 윈도우 사이의 거리 (오버랩 사이즈)
17 DELAY = 0.1 # Delay time in seconds 17 DELAY = 0.1 # Delay time in seconds
18 -MAX_FREQ = 3000 # max freq for pitch shifting 18 +MAX_FREQ = 10000 # max freq for pitch shifting
19 -MAX_HEIGHT = 2000 # max height for pitch shifting 19 +MAX_HEIGHT = 10000 # max height for pitch shifting
20 +MAX_DECIBEL = 50 # max decibel for decibel shifting
21 +SOURCE_MODE = "decibel" # height, decibel or frequency
22 +MODE = "low_filter" # low_filter, echo or pitch_shift
23 +SOUND_SIZE = len(ORIGIN_SOUND) # 음원 길이
20 24
21 sound_idx = 0 25 sound_idx = 0
22 26
...@@ -56,10 +60,8 @@ def get_user_audio(duration): ...@@ -56,10 +60,8 @@ def get_user_audio(duration):
56 60
57 return sound 61 return sound
58 62
59 -ORIGIN_SOUND = get_user_audio(0.5) 63 +if "y" == input("직접 녹음을 하시겠습니까? (y/n) : "):
60 -SOURCE_MODE = "decibel" # decibel or frequency 64 + ORIGIN_SOUND = get_user_audio(0.2)
61 -MODE = "high_filter" # echo or pitch_shift
62 -SOUND_SIZE = len(ORIGIN_SOUND) # 음원 길이
63 65
64 sound = ORIGIN_SOUND.copy() 66 sound = ORIGIN_SOUND.copy()
65 67
...@@ -67,24 +69,27 @@ print(type(sound), len(sound)) ...@@ -67,24 +69,27 @@ print(type(sound), len(sound))
67 69
68 p = pyaudio.PyAudio() 70 p = pyaudio.PyAudio()
69 71
70 -last_frame = 0 72 +last_time = 0
71 73
72 # 콜백 함수 정의 74 # 콜백 함수 정의
73 def process_audio(in_data, frame_count, time_info, status): 75 def process_audio(in_data, frame_count, time_info, status):
74 global buffer 76 global buffer
75 global sound 77 global sound
76 global sound_idx 78 global sound_idx
77 - global last_frame 79 + global last_time
78 80
79 81
80 - def get_distortion(height, frequency): 82 + def get_distortion(height, frequency, decibel):
81 - height = min(height, MAX_HEIGHT) / MAX_HEIGHT 83 + height = min(height, MAX_HEIGHT) / RATE
82 frequency = min(frequency, MAX_FREQ) / MAX_FREQ 84 frequency = min(frequency, MAX_FREQ) / MAX_FREQ
85 + decibel = min(decibel, MAX_DECIBEL) / MAX_DECIBEL
83 86
84 - if SOURCE_MODE == "decibel": 87 + if SOURCE_MODE == "height":
85 param = height 88 param = height
86 elif SOURCE_MODE == "frequency": 89 elif SOURCE_MODE == "frequency":
87 param = frequency 90 param = frequency
91 + elif SOURCE_MODE == "decibel":
92 + param = decibel
88 else: 93 else:
89 return ORIGIN_SOUND 94 return ORIGIN_SOUND
90 95
...@@ -110,16 +115,17 @@ def process_audio(in_data, frame_count, time_info, status): ...@@ -110,16 +115,17 @@ def process_audio(in_data, frame_count, time_info, status):
110 return echoed_samples 115 return echoed_samples
111 116
112 def shift_pitch(frequency): 117 def shift_pitch(frequency):
113 - pitch_shift_factor = frequency 118 + pitch_shift_factor = frequency * 3
114 audio_array = ORIGIN_SOUND.copy() 119 audio_array = ORIGIN_SOUND.copy()
115 # Resample the audio array to change the pitch 120 # Resample the audio array to change the pitch
116 resampled_array = librosa.effects.pitch_shift(np.array(audio_array, dtype=np.float32), sr=RATE, n_steps=pitch_shift_factor, bins_per_octave=1) 121 resampled_array = librosa.effects.pitch_shift(np.array(audio_array, dtype=np.float32), sr=RATE, n_steps=pitch_shift_factor, bins_per_octave=1)
117 return np.array(resampled_array, dtype=np.int16) 122 return np.array(resampled_array, dtype=np.int16)
118 123
119 def low_filter(param): 124 def low_filter(param):
120 - audio_data = data 125 + audio_data = np.array(ORIGIN_SOUND.copy(), dtype=np.float32)
121 # Define the filter parameters 126 # Define the filter parameters
122 - cutoff_freq = param * MAX_FREQ # Frequency cutoff for the low-pass filter (in Hz) 127 + cutoff_freq = param * RATE # Frequency cutoff for the low-pass filter (in Hz)
128 + print("cut of below : ", cutoff_freq)
123 nyquist_freq = 0.5 * RATE # Nyquist frequency (half of the sampling rate) 129 nyquist_freq = 0.5 * RATE # Nyquist frequency (half of the sampling rate)
124 normalized_cutoff = cutoff_freq / nyquist_freq # Normalized cutoff frequency 130 normalized_cutoff = cutoff_freq / nyquist_freq # Normalized cutoff frequency
125 131
...@@ -129,7 +135,7 @@ def process_audio(in_data, frame_count, time_info, status): ...@@ -129,7 +135,7 @@ def process_audio(in_data, frame_count, time_info, status):
129 # Apply the low-pass filter to the audio data 135 # Apply the low-pass filter to the audio data
130 filtered_audio = signal.lfilter(b, a, audio_data) 136 filtered_audio = signal.lfilter(b, a, audio_data)
131 137
132 - return filtered_audio 138 + return np.array(filtered_audio, dtype=np.int16)
133 139
134 # 오디오 데이터 변환 140 # 오디오 데이터 변환
135 data = np.frombuffer(in_data, dtype=np.int16) 141 data = np.frombuffer(in_data, dtype=np.int16)
...@@ -140,18 +146,19 @@ def process_audio(in_data, frame_count, time_info, status): ...@@ -140,18 +146,19 @@ def process_audio(in_data, frame_count, time_info, status):
140 # 피크 검출 146 # 피크 검출
141 peaks, _ = signal.find_peaks(np.abs(np.mean(Zxx, axis=1)), height=THRESHOLD, distance=WIN_SIZE) 147 peaks, _ = signal.find_peaks(np.abs(np.mean(Zxx, axis=1)), height=THRESHOLD, distance=WIN_SIZE)
142 # 파라미터 추정 148 # 파라미터 추정
143 - if len(peaks) > 0 and last_frame+1 != frame_count: 149 + if len(peaks) > 0 and last_time+0.1 < time_info['current_time']:
144 - last_frame = frame_count
145 peak_idx = peaks[0] # 첫 번째 피크 선택 150 peak_idx = peaks[0] # 첫 번째 피크 선택
146 height = np.abs(Zxx[peak_idx, 0]) # 피크의 높이 추정 151 height = np.abs(Zxx[peak_idx, 0]) # 피크의 높이 추정
147 freq = f[peak_idx] # 피크의 주파수 추정 152 freq = f[peak_idx] # 피크의 주파수 추정
148 amp = np.max(np.abs(data)) # 신호의 진폭 추정 153 amp = np.max(np.abs(data)) # 신호의 진폭 추정
149 decibel = np.mean(librosa.amplitude_to_db(np.abs(Zxx))) # 진폭을 데시벨로 변환 154 decibel = np.mean(librosa.amplitude_to_db(np.abs(Zxx))) # 진폭을 데시벨로 변환
150 155
151 - if(decibel > 20): 156 + if(decibel > 10) and height > 100:
152 - print("Height: {:.2f}, 주파수: {:.2f}, Amplitude: {:.2f}, Decibel: {:.2f}".format(height, freq, amp, decibel)) 157 + last_time = time_info['current_time']
153 - new_sound = get_distortion(height, freq) 158 + print("Height: {:.2f}, 주파수: {:.2f}, Amplitude: {:.2f}, Decibel: {:.2f}, time_info {:.2f}".format(height, freq, amp, decibel, time_info['current_time']))
159 + new_sound = get_distortion(height, freq, decibel)
154 if(sound_idx > len(sound)): 160 if(sound_idx > len(sound)):
161 + sound = new_sound
155 sound_idx = 0 162 sound_idx = 0
156 else: 163 else:
157 mixed_end = min(len(sound), sound_idx + len(new_sound)) 164 mixed_end = min(len(sound), sound_idx + len(new_sound))
...@@ -161,7 +168,7 @@ def process_audio(in_data, frame_count, time_info, status): ...@@ -161,7 +168,7 @@ def process_audio(in_data, frame_count, time_info, status):
161 result = np.concatenate((sound, new_sound[mixed_end-sound_idx:]),axis=0) 168 result = np.concatenate((sound, new_sound[mixed_end-sound_idx:]),axis=0)
162 sound = result 169 sound = result
163 elif len(peaks) > 0: 170 elif len(peaks) > 0:
164 - last_frame = frame_count 171 + last_time = time_info['current_time']
165 172
166 sound_idx += 1024 173 sound_idx += 1024
167 if sound_idx > len(sound): 174 if sound_idx > len(sound):
......
...@@ -5,6 +5,7 @@ import pydub ...@@ -5,6 +5,7 @@ import pydub
5 import time 5 import time
6 import librosa 6 import librosa
7 import tkinter as tk 7 import tkinter as tk
8 +import threading
8 9
9 ORIGIN_SOUND = np.frombuffer(pydub.AudioSegment.from_mp3("./sounds/ping.mp3").raw_data, dtype=np.int16) 10 ORIGIN_SOUND = np.frombuffer(pydub.AudioSegment.from_mp3("./sounds/ping.mp3").raw_data, dtype=np.int16)
10 11
...@@ -25,29 +26,26 @@ window.title("Sound Effect") ...@@ -25,29 +26,26 @@ window.title("Sound Effect")
25 window.geometry("640x400+100+100") 26 window.geometry("640x400+100+100")
26 window.resizable(False, False) 27 window.resizable(False, False)
27 28
28 -info_text = tk.StringVar()
29 -info_text.set("welcome! please press record button.")
30 29
31 -info_label = tk.Button(window, textvariable=info_text, foreground="black", background="white") 30 +info_label = tk.Button(window, width=50, height=10, bg="white", fg="black")
32 info_label.pack() 31 info_label.pack()
33 32
33 +info_text = tk.StringVar()
34 +info_text.set("ready for recording...")
35 +info_label.config(textvariable=info_text)
36 +
34 def set_source_mode(mode): 37 def set_source_mode(mode):
35 global SOURCE_MODE 38 global SOURCE_MODE
36 SOURCE_MODE = mode 39 SOURCE_MODE = mode
37 40
38 # 사용자의 목소리를 duration 초간 녹음. 41 # 사용자의 목소리를 duration 초간 녹음.
39 def get_user_audio(duration): 42 def get_user_audio(duration):
40 - global info_text, info_label 43 + global info_text, info_label, ORIGIN_SOUND
41 frames = [] 44 frames = []
42 p = pyaudio.PyAudio() 45 p = pyaudio.PyAudio()
43 46
44 # 카운터 시작 47 # 카운터 시작
45 -
46 info_text.set("ready for recording...") 48 info_text.set("ready for recording...")
47 - for _ in range(3, 0, -1):
48 - info_text.set(str(_))
49 - time.sleep(1)
50 - info_text.set("start...")
51 49
52 # 실제 녹음 콜백 함수 50 # 실제 녹음 콜백 함수
53 def add_to_frame(in_data, frame_count, time_info, status): 51 def add_to_frame(in_data, frame_count, time_info, status):
...@@ -69,17 +67,19 @@ def get_user_audio(duration): ...@@ -69,17 +67,19 @@ def get_user_audio(duration):
69 stream.close() 67 stream.close()
70 p.terminate() 68 p.terminate()
71 69
70 + info_text.set("start...")
71 +
72 return sound 72 return sound
73 73
74 def record(): 74 def record():
75 - global ORIGIN_SOUND 75 + global ORIGIN_SOUND, SOURCE_MODE, MODE
76 - global SOURCE_MODE 76 +
77 ORIGIN_SOUND = get_user_audio(0.5) 77 ORIGIN_SOUND = get_user_audio(0.5)
78 - SOURCE_MODE = "decibel" # decibel or frequency 78 + SOURCE_MODE = "frequency" # decibel or frequency
79 + MODE = "pitch_shift"
79 80
80 def start(): 81 def start():
81 - global MODE, SOUND_SIZE, sound_idx, sound 82 + global MODE, SOUND_SIZE, sound_idx, sound, ORIGIN_SOUND, last_frame, SOURCE_MODE, MODE
82 - MODE = "high_filter" # echo or pitch_shift
83 SOUND_SIZE = len(ORIGIN_SOUND) # 음원 길이 83 SOUND_SIZE = len(ORIGIN_SOUND) # 음원 길이
84 84
85 sound = ORIGIN_SOUND.copy() 85 sound = ORIGIN_SOUND.copy()
...@@ -92,8 +92,7 @@ def start(): ...@@ -92,8 +92,7 @@ def start():
92 92
93 # 콜백 함수 정의 93 # 콜백 함수 정의
94 def process_audio(in_data, frame_count, time_info, status): 94 def process_audio(in_data, frame_count, time_info, status):
95 - 95 + global sound_idx, sound, last_frame, ORIGIN_SOUND, MODE, SOURCE_MODE
96 -
97 def get_distortion(height, frequency): 96 def get_distortion(height, frequency):
98 height = min(height, MAX_HEIGHT) / MAX_HEIGHT 97 height = min(height, MAX_HEIGHT) / MAX_HEIGHT
99 frequency = min(frequency, MAX_FREQ) / MAX_FREQ 98 frequency = min(frequency, MAX_FREQ) / MAX_FREQ
...@@ -216,6 +215,9 @@ def start(): ...@@ -216,6 +215,9 @@ def start():
216 p.terminate() 215 p.terminate()
217 216
218 217
218 +
219 +
220 +
219 record_button = tk.Button(window, text="Record", width=10, height=2, command=lambda: record()) 221 record_button = tk.Button(window, text="Record", width=10, height=2, command=lambda: record())
220 record_button.pack() 222 record_button.pack()
221 223
...@@ -225,6 +227,7 @@ decibel_button.pack() ...@@ -225,6 +227,7 @@ decibel_button.pack()
225 frequency_button = tk.Button(window, text="Frequency", width=10, height=2, command = lambda: set_source_mode("frequency")) 227 frequency_button = tk.Button(window, text="Frequency", width=10, height=2, command = lambda: set_source_mode("frequency"))
226 frequency_button.pack() 228 frequency_button.pack()
227 229
230 +#start_button = tk.Button(window, text="Start", width=10, height=2, command=lambda: threading.Thread(target=start).start())
228 start_button = tk.Button(window, text="Start", width=10, height=2, command=lambda: start()) 231 start_button = tk.Button(window, text="Start", width=10, height=2, command=lambda: start())
229 start_button.pack() 232 start_button.pack()
230 233
......
1 +import numpy as np
2 +import matplotlib.pyplot as plt
3 +import librosa
4 +import librosa.display
5 +
# Demo: visualize how a feedforward echo changes the Mel spectrogram of a clip.
#
# Loads an audio file, applies an echo with two different decay factors, and
# plots the Mel spectrograms of the original and the first echoed version.


def _add_echo(signal, delay, decay):
    """Return *signal* with a single feedforward echo mixed in.

    The dry signal is kept intact and a copy delayed by *delay* samples,
    scaled by *decay*, is added on top.  (The previous code instead delayed
    the dry signal and attenuated the *current* samples — which also silenced
    the first `delay` samples entirely — so it was not an echo.)
    """
    echoed = signal.copy()
    echoed[delay:] += decay * signal[:-delay]
    return echoed


# Load the audio file at its native sampling rate.
audio_file = "./sounds/s4.mp3"
audio, sr = librosa.load(audio_file, sr=None)

# Echo parameters
delay = int(0.2 * sr)  # Echo delay in samples
decay1 = 0.8           # Echo decay factor 1
decay2 = 0.3           # Echo decay factor 2

# Apply echo effect with each decay factor.
echoed_audio1 = _add_echo(audio, delay, decay1)
echoed_audio2 = _add_echo(audio, delay, decay2)  # computed but not plotted below

# Compute Mel (power) spectrograms and convert to dB scale for display.
mel_spec_orig_db = librosa.power_to_db(S=librosa.feature.melspectrogram(y=audio, sr=sr), ref=np.max)
mel_spec_echoed1_db = librosa.power_to_db(S=librosa.feature.melspectrogram(y=echoed_audio1, sr=sr), ref=np.max)
mel_spec_echoed2_db = librosa.power_to_db(S=librosa.feature.melspectrogram(y=echoed_audio2, sr=sr), ref=np.max)

# Display the Mel spectrograms side by side.
plt.figure(figsize=(10, 4))

plt.subplot(1, 2, 1)
librosa.display.specshow(mel_spec_orig_db, sr=sr, x_axis='time', y_axis='mel')
plt.colorbar(format='%+2.0f dB')
plt.title('Original Mel Spectrogram')

plt.subplot(1, 2, 2)
librosa.display.specshow(mel_spec_echoed1_db, sr=sr, x_axis='time', y_axis='mel')
plt.colorbar(format='%+2.0f dB')
plt.title('Echoed Mel Spectrogram (Decay 1)')

plt.tight_layout()
plt.show()
1 +import numpy as np
2 +import matplotlib.pyplot as plt
3 +import librosa
4 +import librosa.display
5 +from scipy.signal import butter, lfilter
6 +import sounddevice as sd
7 +
def apply_low_pass_filter(data, cutoff_freq, sample_rate, order=5):
    """Apply a Butterworth low-pass filter to a 1-D signal.

    Parameters
    ----------
    data : np.ndarray
        Input samples (1-D).
    cutoff_freq : float
        Cutoff frequency in Hz; must be below the Nyquist frequency
        (``sample_rate / 2``).
    sample_rate : float
        Sampling rate of *data* in Hz.
    order : int, optional
        Filter order (default 5); higher orders give a steeper roll-off.

    Returns
    -------
    np.ndarray
        Filtered signal, same length as *data*.
    """
    # butter() expects the cutoff normalized to the Nyquist frequency.
    nyquist_freq = 0.5 * sample_rate
    digital_cutoff = cutoff_freq / nyquist_freq

    # Create the filter coefficients using a Butterworth design.
    b, a = butter(order, digital_cutoff, btype='low', analog=False)

    # One-directional IIR filtering (introduces phase delay; use filtfilt
    # instead if zero-phase filtering is needed).
    # NOTE: the leftover debug print of the input's type/shape/dtype was removed.
    filtered_data = lfilter(b, a, data)

    return filtered_data
22 +
# Demo: low-pass filter an audio clip, play the result, and compare the
# waveforms and Mel spectrograms before and after filtering.

# Read the clip at its native sampling rate.
audio_file = "./sounds/s4.mp3"
signal_in, sr = librosa.load(audio_file, sr=None)

# Filter settings.
cutoff_frequency = 5000  # cutoff in Hz
sample_rate = sr

# Filter the clip, then play it through the default output device.
signal_out = apply_low_pass_filter(signal_in, cutoff_frequency, sample_rate)
sd.play(signal_out, sample_rate)

# Mel spectrograms of both versions, converted to dB for display.
orig_db = librosa.power_to_db(S=librosa.feature.melspectrogram(y=signal_in, sr=sr), ref=np.max)
filt_db = librosa.power_to_db(S=librosa.feature.melspectrogram(y=signal_out, sr=sr), ref=np.max)

# 2x2 grid: waveforms on top, spectrograms below.
plt.figure(figsize=(12, 8))

plt.subplot(2, 2, 1)
plt.plot(signal_in)
plt.title('Original Signal')
plt.xlabel('Time')
plt.ylabel('Amplitude')

plt.subplot(2, 2, 2)
plt.plot(signal_out)
plt.title('Filtered Signal')
plt.xlabel('Time')
plt.ylabel('Amplitude')

plt.subplot(2, 2, 3)
librosa.display.specshow(orig_db, sr=sr, x_axis='time', y_axis='mel')
plt.colorbar(format='%+2.0f dB')
plt.title('Original Mel Spectrogram')

plt.subplot(2, 2, 4)
librosa.display.specshow(filt_db, sr=sr, x_axis='time', y_axis='mel')
plt.colorbar(format='%+2.0f dB')
plt.title('Filtered Mel Spectrogram')

plt.tight_layout()
plt.show()
1 +import numpy as np
2 +import librosa
3 +import librosa.display
4 +import matplotlib.pyplot as plt
5 +
# Demo: pitch-shift an audio clip and compare Mel spectrograms before/after.

# Read the clip at its native sampling rate.
audio_file = "./sounds/s4.mp3"
clip, sr = librosa.load(audio_file, sr=None)

# Mel (power) spectrogram of the untouched clip.
mel_original = librosa.feature.melspectrogram(y=clip, sr=sr)

# With bins_per_octave=1, each unit of n_steps is a full octave, so this
# raises the pitch by two octaves.
pitch_shift_factor = 2.0
shifted_clip = librosa.effects.pitch_shift(y=clip, sr=sr, n_steps=pitch_shift_factor, bins_per_octave=1)
mel_shifted = librosa.feature.melspectrogram(y=shifted_clip, sr=sr)

# Show both spectrograms (dB scale) side by side.
plt.figure(figsize=(10, 4))

plt.subplot(1, 2, 1)
librosa.display.specshow(librosa.power_to_db(mel_original, ref=np.max), sr=sr, x_axis='time', y_axis='mel')
plt.title('Original Mel Spectrogram')

plt.subplot(1, 2, 2)
librosa.display.specshow(librosa.power_to_db(mel_shifted, ref=np.max), sr=sr, x_axis='time', y_axis='mel')
plt.title('Pitch-Shifted Mel Spectrogram')

plt.tight_layout()
plt.show()