노현욱

Mel-spectrogram visualization code

...@@ -6,17 +6,21 @@ import time ...@@ -6,17 +6,21 @@ import time
6 import librosa 6 import librosa
7 import tkinter as tk 7 import tkinter as tk
8 8
9 -ORIGIN_SOUND = np.frombuffer(pydub.AudioSegment.from_mp3("./sounds/ping.mp3").raw_data, dtype=np.int16) 9 +ORIGIN_SOUND = np.frombuffer(pydub.AudioSegment.from_mp3("./sounds/s1.mp3").raw_data, dtype=np.int16)
10 10
11 # 파라미터 설정 11 # 파라미터 설정
12 RATE = 44100 # 샘플링 주파수 12 RATE = 44100 # 샘플링 주파수
13 CHUNK = 1024 # 읽을 샘플의 수 13 CHUNK = 1024 # 읽을 샘플의 수
14 -THRESHOLD = 256 # 피크를 검출하기 위한 threshold 값 14 +THRESHOLD = 128 # 피크를 검출하기 위한 threshold 값
15 WIN_SIZE = 1024 # STFT를 적용할 윈도우 사이즈 15 WIN_SIZE = 1024 # STFT를 적용할 윈도우 사이즈
16 HOP_SIZE = 512 # STFT에서 윈도우 사이의 거리 (오버랩 사이즈) 16 HOP_SIZE = 512 # STFT에서 윈도우 사이의 거리 (오버랩 사이즈)
17 DELAY = 0.1 # Delay time in seconds 17 DELAY = 0.1 # Delay time in seconds
18 -MAX_FREQ = 3000 # max freq for pitch shifting 18 +MAX_FREQ = 10000 # max freq for pitch shifting
19 -MAX_HEIGHT = 2000 # max height for pitch shifting 19 +MAX_HEIGHT = 10000 # max height for pitch shifting
20 +MAX_DECIBEL = 50 # max decibel for decibel shifting
21 +SOURCE_MODE = "decibel" # height, decibel or frequency
22 +MODE = "low_filter" # low_filter, echo or pitch_shift
23 +SOUND_SIZE = len(ORIGIN_SOUND) # 음원 길이
20 24
21 sound_idx = 0 25 sound_idx = 0
22 26
...@@ -56,10 +60,8 @@ def get_user_audio(duration): ...@@ -56,10 +60,8 @@ def get_user_audio(duration):
56 60
57 return sound 61 return sound
58 62
59 -ORIGIN_SOUND = get_user_audio(0.5) 63 +if "y" == input("직접 녹음을 하시겠습니까? (y/n) : "):
60 -SOURCE_MODE = "decibel" # decibel or frequency 64 + ORIGIN_SOUND = get_user_audio(0.2)
61 -MODE = "high_filter" # echo or pitch_shift
62 -SOUND_SIZE = len(ORIGIN_SOUND) # 음원 길이
63 65
64 sound = ORIGIN_SOUND.copy() 66 sound = ORIGIN_SOUND.copy()
65 67
...@@ -67,24 +69,27 @@ print(type(sound), len(sound)) ...@@ -67,24 +69,27 @@ print(type(sound), len(sound))
67 69
68 p = pyaudio.PyAudio() 70 p = pyaudio.PyAudio()
69 71
70 -last_frame = 0 72 +last_time = 0
71 73
72 # 콜백 함수 정의 74 # 콜백 함수 정의
73 def process_audio(in_data, frame_count, time_info, status): 75 def process_audio(in_data, frame_count, time_info, status):
74 global buffer 76 global buffer
75 global sound 77 global sound
76 global sound_idx 78 global sound_idx
77 - global last_frame 79 + global last_time
78 80
79 81
80 - def get_distortion(height, frequency): 82 + def get_distortion(height, frequency, decibel):
81 - height = min(height, MAX_HEIGHT) / MAX_HEIGHT 83 + height = min(height, MAX_HEIGHT) / RATE
82 frequency = min(frequency, MAX_FREQ) / MAX_FREQ 84 frequency = min(frequency, MAX_FREQ) / MAX_FREQ
85 + decibel = min(decibel, MAX_DECIBEL) / MAX_DECIBEL
83 86
84 - if SOURCE_MODE == "decibel": 87 + if SOURCE_MODE == "height":
85 param = height 88 param = height
86 elif SOURCE_MODE == "frequency": 89 elif SOURCE_MODE == "frequency":
87 param = frequency 90 param = frequency
91 + elif SOURCE_MODE == "decibel":
92 + param = decibel
88 else: 93 else:
89 return ORIGIN_SOUND 94 return ORIGIN_SOUND
90 95
...@@ -110,16 +115,17 @@ def process_audio(in_data, frame_count, time_info, status): ...@@ -110,16 +115,17 @@ def process_audio(in_data, frame_count, time_info, status):
110 return echoed_samples 115 return echoed_samples
111 116
112 def shift_pitch(frequency): 117 def shift_pitch(frequency):
113 - pitch_shift_factor = frequency 118 + pitch_shift_factor = frequency * 3
114 audio_array = ORIGIN_SOUND.copy() 119 audio_array = ORIGIN_SOUND.copy()
115 # Resample the audio array to change the pitch 120 # Resample the audio array to change the pitch
116 resampled_array = librosa.effects.pitch_shift(np.array(audio_array, dtype=np.float32), sr=RATE, n_steps=pitch_shift_factor, bins_per_octave=1) 121 resampled_array = librosa.effects.pitch_shift(np.array(audio_array, dtype=np.float32), sr=RATE, n_steps=pitch_shift_factor, bins_per_octave=1)
117 return np.array(resampled_array, dtype=np.int16) 122 return np.array(resampled_array, dtype=np.int16)
118 123
119 def low_filter(param): 124 def low_filter(param):
120 - audio_data = data 125 + audio_data = np.array(ORIGIN_SOUND.copy(), dtype=np.float32)
121 # Define the filter parameters 126 # Define the filter parameters
122 - cutoff_freq = param * MAX_FREQ # Frequency cutoff for the low-pass filter (in Hz) 127 + cutoff_freq = param * RATE # Frequency cutoff for the low-pass filter (in Hz)
128 + print("cut of below : ", cutoff_freq)
123 nyquist_freq = 0.5 * RATE # Nyquist frequency (half of the sampling rate) 129 nyquist_freq = 0.5 * RATE # Nyquist frequency (half of the sampling rate)
124 normalized_cutoff = cutoff_freq / nyquist_freq # Normalized cutoff frequency 130 normalized_cutoff = cutoff_freq / nyquist_freq # Normalized cutoff frequency
125 131
...@@ -129,7 +135,7 @@ def process_audio(in_data, frame_count, time_info, status): ...@@ -129,7 +135,7 @@ def process_audio(in_data, frame_count, time_info, status):
129 # Apply the low-pass filter to the audio data 135 # Apply the low-pass filter to the audio data
130 filtered_audio = signal.lfilter(b, a, audio_data) 136 filtered_audio = signal.lfilter(b, a, audio_data)
131 137
132 - return filtered_audio 138 + return np.array(filtered_audio, dtype=np.int16)
133 139
134 # 오디오 데이터 변환 140 # 오디오 데이터 변환
135 data = np.frombuffer(in_data, dtype=np.int16) 141 data = np.frombuffer(in_data, dtype=np.int16)
...@@ -140,18 +146,19 @@ def process_audio(in_data, frame_count, time_info, status): ...@@ -140,18 +146,19 @@ def process_audio(in_data, frame_count, time_info, status):
140 # 피크 검출 146 # 피크 검출
141 peaks, _ = signal.find_peaks(np.abs(np.mean(Zxx, axis=1)), height=THRESHOLD, distance=WIN_SIZE) 147 peaks, _ = signal.find_peaks(np.abs(np.mean(Zxx, axis=1)), height=THRESHOLD, distance=WIN_SIZE)
142 # 파라미터 추정 148 # 파라미터 추정
143 - if len(peaks) > 0 and last_frame+1 != frame_count: 149 + if len(peaks) > 0 and last_time+0.1 < time_info['current_time']:
144 - last_frame = frame_count
145 peak_idx = peaks[0] # 첫 번째 피크 선택 150 peak_idx = peaks[0] # 첫 번째 피크 선택
146 height = np.abs(Zxx[peak_idx, 0]) # 피크의 높이 추정 151 height = np.abs(Zxx[peak_idx, 0]) # 피크의 높이 추정
147 freq = f[peak_idx] # 피크의 주파수 추정 152 freq = f[peak_idx] # 피크의 주파수 추정
148 amp = np.max(np.abs(data)) # 신호의 진폭 추정 153 amp = np.max(np.abs(data)) # 신호의 진폭 추정
149 decibel = np.mean(librosa.amplitude_to_db(np.abs(Zxx))) # 진폭을 데시벨로 변환 154 decibel = np.mean(librosa.amplitude_to_db(np.abs(Zxx))) # 진폭을 데시벨로 변환
150 155
151 - if(decibel > 20): 156 + if(decibel > 10) and height > 100:
152 - print("Height: {:.2f}, 주파수: {:.2f}, Amplitude: {:.2f}, Decibel: {:.2f}".format(height, freq, amp, decibel)) 157 + last_time = time_info['current_time']
153 - new_sound = get_distortion(height, freq) 158 + print("Height: {:.2f}, 주파수: {:.2f}, Amplitude: {:.2f}, Decibel: {:.2f}, time_info {:.2f}".format(height, freq, amp, decibel, time_info['current_time']))
159 + new_sound = get_distortion(height, freq, decibel)
154 if(sound_idx > len(sound)): 160 if(sound_idx > len(sound)):
161 + sound = new_sound
155 sound_idx = 0 162 sound_idx = 0
156 else: 163 else:
157 mixed_end = min(len(sound), sound_idx + len(new_sound)) 164 mixed_end = min(len(sound), sound_idx + len(new_sound))
...@@ -161,7 +168,7 @@ def process_audio(in_data, frame_count, time_info, status): ...@@ -161,7 +168,7 @@ def process_audio(in_data, frame_count, time_info, status):
161 result = np.concatenate((sound, new_sound[mixed_end-sound_idx:]),axis=0) 168 result = np.concatenate((sound, new_sound[mixed_end-sound_idx:]),axis=0)
162 sound = result 169 sound = result
163 elif len(peaks) > 0: 170 elif len(peaks) > 0:
164 - last_frame = frame_count 171 + last_time = time_info['current_time']
165 172
166 sound_idx += 1024 173 sound_idx += 1024
167 if sound_idx > len(sound): 174 if sound_idx > len(sound):
......
...@@ -5,6 +5,7 @@ import pydub ...@@ -5,6 +5,7 @@ import pydub
5 import time 5 import time
6 import librosa 6 import librosa
7 import tkinter as tk 7 import tkinter as tk
8 +import threading
8 9
9 ORIGIN_SOUND = np.frombuffer(pydub.AudioSegment.from_mp3("./sounds/ping.mp3").raw_data, dtype=np.int16) 10 ORIGIN_SOUND = np.frombuffer(pydub.AudioSegment.from_mp3("./sounds/ping.mp3").raw_data, dtype=np.int16)
10 11
...@@ -25,29 +26,26 @@ window.title("Sound Effect") ...@@ -25,29 +26,26 @@ window.title("Sound Effect")
25 window.geometry("640x400+100+100") 26 window.geometry("640x400+100+100")
26 window.resizable(False, False) 27 window.resizable(False, False)
27 28
28 -info_text = tk.StringVar()
29 -info_text.set("welcome! please press record button.")
30 29
31 -info_label = tk.Button(window, textvariable=info_text, foreground="black", background="white") 30 +info_label = tk.Button(window, width=50, height=10, bg="white", fg="black")
32 info_label.pack() 31 info_label.pack()
33 32
33 +info_text = tk.StringVar()
34 +info_text.set("ready for recording...")
35 +info_label.config(textvariable=info_text)
36 +
34 def set_source_mode(mode): 37 def set_source_mode(mode):
35 global SOURCE_MODE 38 global SOURCE_MODE
36 SOURCE_MODE = mode 39 SOURCE_MODE = mode
37 40
38 # 사용자의 목소리를 duration 초간 녹음. 41 # 사용자의 목소리를 duration 초간 녹음.
39 def get_user_audio(duration): 42 def get_user_audio(duration):
40 - global info_text, info_label 43 + global info_text, info_label, ORIGIN_SOUND
41 frames = [] 44 frames = []
42 p = pyaudio.PyAudio() 45 p = pyaudio.PyAudio()
43 46
44 # 카운터 시작 47 # 카운터 시작
45 -
46 info_text.set("ready for recording...") 48 info_text.set("ready for recording...")
47 - for _ in range(3, 0, -1):
48 - info_text.set(str(_))
49 - time.sleep(1)
50 - info_text.set("start...")
51 49
52 # 실제 녹음 콜백 함수 50 # 실제 녹음 콜백 함수
53 def add_to_frame(in_data, frame_count, time_info, status): 51 def add_to_frame(in_data, frame_count, time_info, status):
...@@ -69,17 +67,19 @@ def get_user_audio(duration): ...@@ -69,17 +67,19 @@ def get_user_audio(duration):
69 stream.close() 67 stream.close()
70 p.terminate() 68 p.terminate()
71 69
70 + info_text.set("start...")
71 +
72 return sound 72 return sound
73 73
74 def record(): 74 def record():
75 - global ORIGIN_SOUND 75 + global ORIGIN_SOUND, SOURCE_MODE, MODE
76 - global SOURCE_MODE 76 +
77 ORIGIN_SOUND = get_user_audio(0.5) 77 ORIGIN_SOUND = get_user_audio(0.5)
78 - SOURCE_MODE = "decibel" # decibel or frequency 78 + SOURCE_MODE = "frequency" # decibel or frequency
79 + MODE = "pitch_shift"
79 80
80 def start(): 81 def start():
81 - global MODE, SOUND_SIZE, sound_idx, sound 82 + global MODE, SOUND_SIZE, sound_idx, sound, ORIGIN_SOUND, last_frame, SOURCE_MODE, MODE
82 - MODE = "high_filter" # echo or pitch_shift
83 SOUND_SIZE = len(ORIGIN_SOUND) # 음원 길이 83 SOUND_SIZE = len(ORIGIN_SOUND) # 음원 길이
84 84
85 sound = ORIGIN_SOUND.copy() 85 sound = ORIGIN_SOUND.copy()
...@@ -92,8 +92,7 @@ def start(): ...@@ -92,8 +92,7 @@ def start():
92 92
93 # 콜백 함수 정의 93 # 콜백 함수 정의
94 def process_audio(in_data, frame_count, time_info, status): 94 def process_audio(in_data, frame_count, time_info, status):
95 - 95 + global sound_idx, sound, last_frame, ORIGIN_SOUND, MODE, SOURCE_MODE
96 -
97 def get_distortion(height, frequency): 96 def get_distortion(height, frequency):
98 height = min(height, MAX_HEIGHT) / MAX_HEIGHT 97 height = min(height, MAX_HEIGHT) / MAX_HEIGHT
99 frequency = min(frequency, MAX_FREQ) / MAX_FREQ 98 frequency = min(frequency, MAX_FREQ) / MAX_FREQ
...@@ -216,6 +215,9 @@ def start(): ...@@ -216,6 +215,9 @@ def start():
216 p.terminate() 215 p.terminate()
217 216
218 217
218 +
219 +
220 +
219 record_button = tk.Button(window, text="Record", width=10, height=2, command=lambda: record()) 221 record_button = tk.Button(window, text="Record", width=10, height=2, command=lambda: record())
220 record_button.pack() 222 record_button.pack()
221 223
...@@ -225,6 +227,7 @@ decibel_button.pack() ...@@ -225,6 +227,7 @@ decibel_button.pack()
225 frequency_button = tk.Button(window, text="Frequency", width=10, height=2, command = lambda: set_source_mode("frequency")) 227 frequency_button = tk.Button(window, text="Frequency", width=10, height=2, command = lambda: set_source_mode("frequency"))
226 frequency_button.pack() 228 frequency_button.pack()
227 229
230 +#start_button = tk.Button(window, text="Start", width=10, height=2, command=lambda: threading.Thread(target=start).start())
228 start_button = tk.Button(window, text="Start", width=10, height=2, command=lambda: start()) 231 start_button = tk.Button(window, text="Start", width=10, height=2, command=lambda: start())
229 start_button.pack() 232 start_button.pack()
230 233
......
1 +import numpy as np
2 +import matplotlib.pyplot as plt
3 +import librosa
4 +import librosa.display
5 +
# Demo: visualize how a feedforward echo changes the Mel spectrogram of a clip.
#
# Loads an audio file, applies an echo with two different decay factors, and
# plots the Mel spectrograms of the original and the first echoed version.


def _add_echo(signal, delay, decay):
    """Return *signal* with a single feedforward echo mixed in.

    The dry signal is kept intact and a copy delayed by *delay* samples,
    scaled by *decay*, is added on top.  (The previous code instead delayed
    the dry signal and attenuated the *current* samples — which also silenced
    the first `delay` samples entirely — so it was not an echo.)
    """
    echoed = signal.copy()
    echoed[delay:] += decay * signal[:-delay]
    return echoed


# Load the audio file at its native sampling rate.
audio_file = "./sounds/s4.mp3"
audio, sr = librosa.load(audio_file, sr=None)

# Echo parameters
delay = int(0.2 * sr)  # Echo delay in samples
decay1 = 0.8           # Echo decay factor 1
decay2 = 0.3           # Echo decay factor 2

# Apply echo effect with each decay factor.
echoed_audio1 = _add_echo(audio, delay, decay1)
echoed_audio2 = _add_echo(audio, delay, decay2)  # computed but not plotted below

# Compute Mel (power) spectrograms and convert to dB scale for display.
mel_spec_orig_db = librosa.power_to_db(S=librosa.feature.melspectrogram(y=audio, sr=sr), ref=np.max)
mel_spec_echoed1_db = librosa.power_to_db(S=librosa.feature.melspectrogram(y=echoed_audio1, sr=sr), ref=np.max)
mel_spec_echoed2_db = librosa.power_to_db(S=librosa.feature.melspectrogram(y=echoed_audio2, sr=sr), ref=np.max)

# Display the Mel spectrograms side by side.
plt.figure(figsize=(10, 4))

plt.subplot(1, 2, 1)
librosa.display.specshow(mel_spec_orig_db, sr=sr, x_axis='time', y_axis='mel')
plt.colorbar(format='%+2.0f dB')
plt.title('Original Mel Spectrogram')

plt.subplot(1, 2, 2)
librosa.display.specshow(mel_spec_echoed1_db, sr=sr, x_axis='time', y_axis='mel')
plt.colorbar(format='%+2.0f dB')
plt.title('Echoed Mel Spectrogram (Decay 1)')

plt.tight_layout()
plt.show()
1 +import numpy as np
2 +import matplotlib.pyplot as plt
3 +import librosa
4 +import librosa.display
5 +from scipy.signal import butter, lfilter
6 +import sounddevice as sd
7 +
def apply_low_pass_filter(data, cutoff_freq, sample_rate, order=5):
    """Apply a Butterworth low-pass filter to a 1-D signal.

    Parameters
    ----------
    data : np.ndarray
        Input samples (1-D).
    cutoff_freq : float
        Cutoff frequency in Hz; must be below the Nyquist frequency
        (``sample_rate / 2``).
    sample_rate : float
        Sampling rate of *data* in Hz.
    order : int, optional
        Filter order (default 5); higher orders give a steeper roll-off.

    Returns
    -------
    np.ndarray
        Filtered signal, same length as *data*.
    """
    # butter() expects the cutoff normalized to the Nyquist frequency.
    nyquist_freq = 0.5 * sample_rate
    digital_cutoff = cutoff_freq / nyquist_freq

    # Create the filter coefficients using a Butterworth design.
    b, a = butter(order, digital_cutoff, btype='low', analog=False)

    # One-directional IIR filtering (introduces phase delay; use filtfilt
    # instead if zero-phase filtering is needed).
    # NOTE: the leftover debug print of the input's type/shape/dtype was removed.
    filtered_data = lfilter(b, a, data)

    return filtered_data
22 +
# Demo: low-pass filter an audio clip, play the result, and compare the
# waveforms and Mel spectrograms before and after filtering.

# Read the clip at its native sampling rate.
audio_file = "./sounds/s4.mp3"
signal_in, sr = librosa.load(audio_file, sr=None)

# Filter settings.
cutoff_frequency = 5000  # cutoff in Hz
sample_rate = sr

# Filter the clip, then play it through the default output device.
signal_out = apply_low_pass_filter(signal_in, cutoff_frequency, sample_rate)
sd.play(signal_out, sample_rate)

# Mel spectrograms of both versions, converted to dB for display.
orig_db = librosa.power_to_db(S=librosa.feature.melspectrogram(y=signal_in, sr=sr), ref=np.max)
filt_db = librosa.power_to_db(S=librosa.feature.melspectrogram(y=signal_out, sr=sr), ref=np.max)

# 2x2 grid: waveforms on top, spectrograms below.
plt.figure(figsize=(12, 8))

plt.subplot(2, 2, 1)
plt.plot(signal_in)
plt.title('Original Signal')
plt.xlabel('Time')
plt.ylabel('Amplitude')

plt.subplot(2, 2, 2)
plt.plot(signal_out)
plt.title('Filtered Signal')
plt.xlabel('Time')
plt.ylabel('Amplitude')

plt.subplot(2, 2, 3)
librosa.display.specshow(orig_db, sr=sr, x_axis='time', y_axis='mel')
plt.colorbar(format='%+2.0f dB')
plt.title('Original Mel Spectrogram')

plt.subplot(2, 2, 4)
librosa.display.specshow(filt_db, sr=sr, x_axis='time', y_axis='mel')
plt.colorbar(format='%+2.0f dB')
plt.title('Filtered Mel Spectrogram')

plt.tight_layout()
plt.show()
1 +import numpy as np
2 +import librosa
3 +import librosa.display
4 +import matplotlib.pyplot as plt
5 +
# Demo: pitch-shift an audio clip and compare Mel spectrograms before/after.

# Read the clip at its native sampling rate.
audio_file = "./sounds/s4.mp3"
clip, sr = librosa.load(audio_file, sr=None)

# Mel (power) spectrogram of the untouched clip.
mel_original = librosa.feature.melspectrogram(y=clip, sr=sr)

# With bins_per_octave=1, each unit of n_steps is a full octave, so this
# raises the pitch by two octaves.
pitch_shift_factor = 2.0
shifted_clip = librosa.effects.pitch_shift(y=clip, sr=sr, n_steps=pitch_shift_factor, bins_per_octave=1)
mel_shifted = librosa.feature.melspectrogram(y=shifted_clip, sr=sr)

# Show both spectrograms (dB scale) side by side.
plt.figure(figsize=(10, 4))

plt.subplot(1, 2, 1)
librosa.display.specshow(librosa.power_to_db(mel_original, ref=np.max), sr=sr, x_axis='time', y_axis='mel')
plt.title('Original Mel Spectrogram')

plt.subplot(1, 2, 2)
librosa.display.specshow(librosa.power_to_db(mel_shifted, ref=np.max), sr=sr, x_axis='time', y_axis='mel')
plt.title('Pitch-Shifted Mel Spectrogram')

plt.tight_layout()
plt.show()