Mel spectrogram display scripts

노현욱
Commit 9ed30dfca2aab99ec373e979d561fb3ad3487e78 9ed30dfc 1 parent 0dda3806
Showing 8 changed files with 213 additions and 39 deletions
find_peak.py → cli_version.py
find_peak copy.py → gui_version.py
sounds/funny-sound-effect-for-quotjack-in-the-boxquot-sound-ver2-110924.mp3 → sounds/s2.mp3
sounds/funny-sound-effect-for-quotjack-in-the-boxquot-sound-ver3-110925.mp3 → sounds/s3.mp3
sounds/funny-sound-effect-for-quotjack-in-the-boxquot-sound-ver4-110926.mp3 → sounds/s4.mp3
spectograms/echo_added_spectogram.py
spectograms/low_filter_spectogram.py
spectograms/pitch_shifted_spectogram.py
--- a/find_peak.py → cli_version.py
View file @9ed30df
+++ b/find_peak.py → cli_version.py
View file @9ed30df
@@ -6,17 +6,21 @@ import time
 import librosa
 import tkinter as tk
 
- ORIGIN_SOUND = np.frombuffer(pydub.AudioSegment.from_mp3("./sounds/ping.mp3").raw_data, dtype=np.int16)
+ ORIGIN_SOUND = np.frombuffer(pydub.AudioSegment.from_mp3("./sounds/s1.mp3").raw_data, dtype=np.int16)
 
 # 파라미터 설정
 RATE = 44100  # 샘플링 주파수
 CHUNK = 1024  # 읽을 샘플의 수
- THRESHOLD = 256  # 피크를 검출하기 위한 threshold 값
+ THRESHOLD = 128  # 피크를 검출하기 위한 threshold 값
 WIN_SIZE = 1024  # STFT를 적용할 윈도우 사이즈
 HOP_SIZE = 512  # STFT에서 윈도우 사이의 거리 (오버랩 사이즈)
 DELAY = 0.1  # Delay time in seconds
- MAX_FREQ = 3000 # max freq for pitch shifting
- MAX_HEIGHT = 2000 # max height for pitch shifting
+ MAX_FREQ = 10000 # max freq for pitch shifting
+ MAX_HEIGHT = 10000 # max height for pitch shifting
+ MAX_DECIBEL = 50 # max decibel for decibel shifting
+ SOURCE_MODE = "decibel" # height, decibel or frequency
+ MODE = "low_filter" # low_filter, echo or pitch_shift
+ SOUND_SIZE = len(ORIGIN_SOUND)  # 음원 길이
 
 sound_idx = 0
 
@@ -56,10 +60,8 @@ def get_user_audio(duration):
 
     return sound
 
- ORIGIN_SOUND = get_user_audio(0.5)
- SOURCE_MODE = "decibel" # decibel or frequency
- MODE = "high_filter" # echo or pitch_shift
- SOUND_SIZE = len(ORIGIN_SOUND)  # 음원 길이
+ if "y" == input("직접 녹음을 하시겠습니까? (y/n) : "):
+     ORIGIN_SOUND = get_user_audio(0.2)
 
 sound = ORIGIN_SOUND.copy()
 
@@ -67,24 +69,27 @@ print(type(sound), len(sound))
 
 p = pyaudio.PyAudio()
 
- last_frame = 0
+ last_time = 0
 
 # 콜백 함수 정의
 def process_audio(in_data, frame_count, time_info, status):
     global buffer
     global sound
     global sound_idx
-     global last_frame
+     global last_time
     
     
-     def get_distortion(height, frequency):
-         height = min(height, MAX_HEIGHT) / MAX_HEIGHT
+     def get_distortion(height, frequency, decibel):
+         height = min(height, MAX_HEIGHT) / RATE
         frequency = min(frequency, MAX_FREQ) / MAX_FREQ
+         decibel = min(decibel, MAX_DECIBEL) / MAX_DECIBEL
 
-         if SOURCE_MODE == "decibel":
+         if SOURCE_MODE == "height":
             param = height
         elif SOURCE_MODE == "frequency":
             param = frequency
+         elif SOURCE_MODE == "decibel":
+             param = decibel
         else:
             return ORIGIN_SOUND
 
@@ -110,16 +115,17 @@ def process_audio(in_data, frame_count, time_info, status):
         return echoed_samples
     
     def shift_pitch(frequency):
-         pitch_shift_factor = frequency
+         pitch_shift_factor = frequency * 3
         audio_array = ORIGIN_SOUND.copy()
         # Resample the audio array to change the pitch
         resampled_array = librosa.effects.pitch_shift(np.array(audio_array, dtype=np.float32), sr=RATE, n_steps=pitch_shift_factor, bins_per_octave=1)
         return np.array(resampled_array, dtype=np.int16)
 
     def low_filter(param):
-         audio_data = data
+         audio_data = np.array(ORIGIN_SOUND.copy(), dtype=np.float32)
         # Define the filter parameters
-         cutoff_freq = param * MAX_FREQ  # Frequency cutoff for the low-pass filter (in Hz)
+         cutoff_freq = param * RATE  # Frequency cutoff for the low-pass filter (in Hz)
+         print("cut of below : ", cutoff_freq)
         nyquist_freq = 0.5 * RATE  # Nyquist frequency (half of the sampling rate)
         normalized_cutoff = cutoff_freq / nyquist_freq  # Normalized cutoff frequency
 
@@ -129,7 +135,7 @@ def process_audio(in_data, frame_count, time_info, status):
         # Apply the low-pass filter to the audio data
         filtered_audio = signal.lfilter(b, a, audio_data)
 
-         return filtered_audio
+         return np.array(filtered_audio, dtype=np.int16)
 
     # 오디오 데이터 변환
     data = np.frombuffer(in_data, dtype=np.int16)
@@ -140,18 +146,19 @@ def process_audio(in_data, frame_count, time_info, status):
     # 피크 검출
     peaks, _ = signal.find_peaks(np.abs(np.mean(Zxx, axis=1)), height=THRESHOLD, distance=WIN_SIZE)
     # 파라미터 추정
-     if len(peaks) > 0 and last_frame+1 != frame_count:
-         last_frame = frame_count
+     if len(peaks) > 0 and last_time+0.1 < time_info['current_time']:
         peak_idx = peaks[0]  # 첫 번째 피크 선택
         height = np.abs(Zxx[peak_idx, 0])  # 피크의 높이 추정
         freq = f[peak_idx]  # 피크의 주파수 추정
         amp = np.max(np.abs(data))  # 신호의 진폭 추정
         decibel = np.mean(librosa.amplitude_to_db(np.abs(Zxx)))  # 진폭을 데시벨로 변환
     
-         if(decibel > 20):
-             print("Height: {:.2f}, 주파수: {:.2f}, Amplitude: {:.2f}, Decibel: {:.2f}".format(height, freq, amp, decibel))
-             new_sound = get_distortion(height, freq)
+         if(decibel > 10) and height > 100:
+             last_time = time_info['current_time']
+             print("Height: {:.2f}, 주파수: {:.2f}, Amplitude: {:.2f}, Decibel: {:.2f}, time_info {:.2f}".format(height, freq, amp, decibel, time_info['current_time']))
+             new_sound = get_distortion(height, freq, decibel)
             if(sound_idx > len(sound)):
+                 sound = new_sound
                 sound_idx = 0
             else:
                 mixed_end = min(len(sound), sound_idx + len(new_sound))
@@ -161,7 +168,7 @@ def process_audio(in_data, frame_count, time_info, status):
                     result = np.concatenate((sound, new_sound[mixed_end-sound_idx:]),axis=0)
                     sound = result
     elif len(peaks) > 0:
-         last_frame = frame_count
+         last_time = time_info['current_time']
 
     sound_idx += 1024
     if sound_idx > len(sound):
--- a/find_peak copy.py → gui_version.py
View file @9ed30df
+++ b/find_peak copy.py → gui_version.py
View file @9ed30df
@@ -5,6 +5,7 @@ import pydub
 import time
 import librosa
 import tkinter as tk
+ import threading
 
 ORIGIN_SOUND = np.frombuffer(pydub.AudioSegment.from_mp3("./sounds/ping.mp3").raw_data, dtype=np.int16)
 
@@ -25,29 +26,26 @@ window.title("Sound Effect")
 window.geometry("640x400+100+100")
 window.resizable(False, False)
 
- info_text = tk.StringVar()
- info_text.set("welcome! please press record button.")
 
- info_label = tk.Button(window, textvariable=info_text, foreground="black", background="white")
+ info_label = tk.Button(window, width=50, height=10, bg="white", fg="black")
 info_label.pack()
 
+ info_text = tk.StringVar()
+ info_text.set("ready for recording...")
+ info_label.config(textvariable=info_text)
+ 
 def set_source_mode(mode):
     global SOURCE_MODE
     SOURCE_MODE = mode
 
 # 사용자의 목소리를 duration 초간 녹음.
 def get_user_audio(duration):
-     global info_text, info_label
+     global info_text, info_label, ORIGIN_SOUND
     frames = []
     p = pyaudio.PyAudio()
 
     # 카운터 시작
- 
     info_text.set("ready for recording...")
-     for _ in range(3, 0, -1):
-         info_text.set(str(_))
-         time.sleep(1)
-     info_text.set("start...")
 
     # 실제 녹음 콜백 함수
     def add_to_frame(in_data, frame_count, time_info, status):
@@ -69,17 +67,19 @@ def get_user_audio(duration):
     stream.close()
     p.terminate()
 
+     info_text.set("start...")
+ 
     return sound
 
 def record():
-     global ORIGIN_SOUND
-     global SOURCE_MODE
+     global ORIGIN_SOUND, SOURCE_MODE, MODE
+ 
     ORIGIN_SOUND = get_user_audio(0.5)
-     SOURCE_MODE = "decibel" # decibel or frequency
+     SOURCE_MODE = "frequency" # decibel or frequency
+     MODE = "pitch_shift"
 
 def start():
-     global MODE, SOUND_SIZE, sound_idx, sound
-     MODE = "high_filter" # echo or pitch_shift
+     global MODE, SOUND_SIZE, sound_idx, sound, ORIGIN_SOUND, last_frame, SOURCE_MODE, MODE
     SOUND_SIZE = len(ORIGIN_SOUND)  # 음원 길이
 
     sound = ORIGIN_SOUND.copy()
@@ -92,8 +92,7 @@ def start():
 
     # 콜백 함수 정의
     def process_audio(in_data, frame_count, time_info, status):
-         
-         
+         global sound_idx, sound, last_frame, ORIGIN_SOUND, MODE, SOURCE_MODE
         def get_distortion(height, frequency):
             height = min(height, MAX_HEIGHT) / MAX_HEIGHT
             frequency = min(frequency, MAX_FREQ) / MAX_FREQ
@@ -216,6 +215,9 @@ def start():
     p.terminate()
 
 
+ 
+ 
+ 
 record_button = tk.Button(window, text="Record", width=10, height=2, command=lambda: record())
 record_button.pack()
 
@@ -225,6 +227,7 @@ decibel_button.pack()
 frequency_button = tk.Button(window, text="Frequency", width=10, height=2, command = lambda: set_source_mode("frequency"))
 frequency_button.pack()
 
+ #start_button = tk.Button(window, text="Start", width=10, height=2, command=lambda: threading.Thread(target=start).start())
 start_button = tk.Button(window, text="Start", width=10, height=2, command=lambda: start())
 start_button.pack()
 
--- a/sounds/funny-sound-effect-for-quotjack-in-the-boxquot-sound-ver2-110924.mp3 → sounds/s2.mp3
View file @9ed30df
+++ b/sounds/funny-sound-effect-for-quotjack-in-the-boxquot-sound-ver2-110924.mp3 → sounds/s2.mp3
View file @9ed30df
--- a/sounds/funny-sound-effect-for-quotjack-in-the-boxquot-sound-ver3-110925.mp3 → sounds/s3.mp3
View file @9ed30df
+++ b/sounds/funny-sound-effect-for-quotjack-in-the-boxquot-sound-ver3-110925.mp3 → sounds/s3.mp3
View file @9ed30df
--- a/sounds/funny-sound-effect-for-quotjack-in-the-boxquot-sound-ver4-110926.mp3 → sounds/s4.mp3
View file @9ed30df
+++ b/sounds/funny-sound-effect-for-quotjack-in-the-boxquot-sound-ver4-110926.mp3 → sounds/s4.mp3
View file @9ed30df
--- a/spectograms/echo_added_spectogram.py 0 → 100644
View file @9ed30df
+++ b/spectograms/echo_added_spectogram.py 0 → 100644
View file @9ed30df
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import librosa
+ import librosa.display
+ 
+ # Load the audio file
+ audio_file = "./sounds/s4.mp3"
+ audio, sr = librosa.load(audio_file, sr=None)
+ 
+ # Echo parameters
+ delay = int(0.2 * sr)  # Echo delay in samples
+ decay1 = 0.8  # Echo decay factor 1
+ decay2 = 0.3  # Echo decay factor 2
+ 
+ # Apply echo effect with decay factor 1
+ echoed_audio1 = np.zeros_like(audio)
+ echoed_audio1[delay:] = audio[:-delay] + decay1 * audio[delay:]
+ 
+ # Apply echo effect with decay factor 2
+ echoed_audio2 = np.zeros_like(audio)
+ echoed_audio2[delay:] = audio[:-delay] + decay2 * audio[delay:]
+ 
+ # Compute the Mel spectrogram of the original audio
+ mel_spec_orig = librosa.feature.melspectrogram(y=audio, sr=sr)
+ 
+ # Compute the Mel spectrogram of the echoed audio with decay factor 1
+ mel_spec_echoed1 = librosa.feature.melspectrogram(y=echoed_audio1, sr=sr)
+ 
+ # Compute the Mel spectrogram of the echoed audio with decay factor 2
+ mel_spec_echoed2 = librosa.feature.melspectrogram(y=echoed_audio2, sr=sr)
+ 
+ # Convert to dB scale
+ mel_spec_orig_db = librosa.power_to_db(mel_spec_orig, ref=np.max)
+ mel_spec_echoed1_db = librosa.power_to_db(S=mel_spec_echoed1, ref=np.max)
+ mel_spec_echoed2_db = librosa.power_to_db(S=mel_spec_echoed2, ref=np.max)
+ 
+ # Display the Mel spectrograms
+ plt.figure(figsize=(10, 4))
+ 
+ plt.subplot(1, 2, 1)
+ librosa.display.specshow(mel_spec_orig_db, sr=sr, x_axis='time', y_axis='mel')
+ plt.colorbar(format='%+2.0f dB')
+ plt.title('Original Mel Spectrogram')
+ 
+ plt.subplot(1, 2, 2)
+ librosa.display.specshow(mel_spec_echoed1_db, sr=sr, x_axis='time', y_axis='mel')
+ plt.colorbar(format='%+2.0f dB')
+ plt.title('Echoed Mel Spectrogram (Decay 1)')
+ 
+ # plt.subplot(1, 3, 3)
+ # librosa.display.specshow(mel_spec_echoed2_db, sr=sr, x_axis='time', y_axis='mel')
+ # plt.colorbar(format='%+2.0f dB')
+ # plt.title('Echoed Mel Spectrogram (Decay 2)')
+ 
+ plt.tight_layout()
+ plt.show()
--- a/spectograms/low_filter_spectogram.py 0 → 100644
View file @9ed30df
+++ b/spectograms/low_filter_spectogram.py 0 → 100644
View file @9ed30df
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import librosa
+ import librosa.display
+ from scipy.signal import butter, lfilter
+ import sounddevice as sd
+ 
+ def apply_low_pass_filter(data, cutoff_freq, sample_rate, order=5):
+     """Low-pass filter ``data`` with a Butterworth filter.
+ 
+     Args:
+         data: Samples handed to ``lfilter``; must expose ``shape`` and
+             ``dtype`` because they are printed for debugging.
+         cutoff_freq: Cutoff frequency, in the same unit as ``sample_rate``.
+         sample_rate: Sampling rate of ``data``; half of it is used as the
+             Nyquist frequency to normalise the cutoff.
+         order: Order of the Butterworth filter (default 5).
+ 
+     Returns:
+         The result of ``lfilter(b, a, data)`` for the designed filter.
+     """
+     # Debug output describing the incoming samples
+     print(type(data), data.shape, data.dtype)
+ 
+     # Calculate the digital cutoff frequency (cutoff as a fraction of Nyquist)
+     nyquist_freq = 0.5 * sample_rate
+     digital_cutoff = cutoff_freq / nyquist_freq
+ 
+     # Create the filter coefficients using Butterworth filter
+     b, a = butter(order, digital_cutoff, btype='low', analog=False)
+ 
+     # Apply the filter to the data
+     filtered_data = lfilter(b, a, data)
+ 
+     return filtered_data
+ 
+ # Load the audio file at its native sampling rate (sr=None disables resampling)
+ audio_file = "./sounds/s4.mp3"
+ audio, sr = librosa.load(audio_file, sr=None)
+ 
+ # Filter parameters
+ # NOTE(review): the normalised cutoff is cutoff_frequency / (0.5 * sr), so
+ # this presumably needs sr above 10 kHz for the filter design to be valid -
+ # confirm for the input file.
+ cutoff_frequency = 5000  # Cutoff frequency in Hz
+ sample_rate = sr
+ 
+ # Apply the low-pass filter
+ filtered_audio = apply_low_pass_filter(audio, cutoff_frequency, sample_rate)
+ 
+ # Play the filtered audio
+ # NOTE(review): sd.play is non-blocking by default, so playback presumably
+ # overlaps the spectrogram computation and plotting below - confirm intended.
+ sd.play(filtered_audio, sample_rate)
+ 
+ # Compute the Mel spectrogram of the original audio (dB relative to its max)
+ mel_spec_orig = librosa.feature.melspectrogram(y=audio, sr=sr)
+ mel_spec_orig_db = librosa.power_to_db(S=mel_spec_orig, ref=np.max)
+ 
+ # Compute the Mel spectrogram of the filtered audio (dB relative to its max)
+ mel_spec_filtered = librosa.feature.melspectrogram(y=filtered_audio, sr=sr)
+ mel_spec_filtered_db = librosa.power_to_db(S=mel_spec_filtered, ref=np.max)
+ 
+ # Plotting the original and filtered signals on a 2x2 grid:
+ # waveforms on the top row, Mel spectrograms on the bottom row
+ plt.figure(figsize=(12, 8))
+ 
+ # Original Signal
+ # plt.plot receives no x values, so the x axis is the sample index
+ plt.subplot(2, 2, 1)
+ plt.plot(audio)
+ plt.title('Original Signal')
+ plt.xlabel('Time')
+ plt.ylabel('Amplitude')
+ 
+ # Filtered Signal (x axis is again the sample index)
+ plt.subplot(2, 2, 2)
+ plt.plot(filtered_audio)
+ plt.title('Filtered Signal')
+ plt.xlabel('Time')
+ plt.ylabel('Amplitude')
+ 
+ # Plotting the Mel spectrograms
+ plt.subplot(2, 2, 3)
+ librosa.display.specshow(mel_spec_orig_db, sr=sr, x_axis='time', y_axis='mel')
+ plt.colorbar(format='%+2.0f dB')
+ plt.title('Original Mel Spectrogram')
+ 
+ plt.subplot(2, 2, 4)
+ librosa.display.specshow(mel_spec_filtered_db, sr=sr, x_axis='time', y_axis='mel')
+ plt.colorbar(format='%+2.0f dB')
+ plt.title('Filtered Mel Spectrogram')
+ 
+ plt.tight_layout()
+ plt.show()
--- a/spectograms/pitch_shifted_spectogram.py 0 → 100644
View file @9ed30df
+++ b/spectograms/pitch_shifted_spectogram.py 0 → 100644
View file @9ed30df
+ import numpy as np
+ import librosa
+ import librosa.display
+ import matplotlib.pyplot as plt
+ 
+ # Load the audio file at its native sampling rate (sr=None disables resampling)
+ audio_file = "./sounds/s4.mp3"
+ audio, sr = librosa.load(audio_file, sr=None)
+ 
+ # Compute the original Mel spectrogram
+ mel_spec_orig = librosa.feature.melspectrogram(y=audio, sr=sr)
+ 
+ # Pitch shift parameters
+ # Passed as n_steps below; with bins_per_octave=1 one step is a whole octave,
+ # so 2.0 means a shift of two octaves upward.
+ pitch_shift_factor = 2.0
+ 
+ # Apply pitch shifting
+ audio_pitch_shifted = librosa.effects.pitch_shift(y=audio, sr=sr, n_steps=pitch_shift_factor, bins_per_octave=1)
+ mel_spec_pitch_shifted = librosa.feature.melspectrogram(y=audio_pitch_shifted, sr=sr)
+ 
+ # Display the original Mel spectrogram (left panel, dB relative to its max)
+ plt.figure(figsize=(10, 4))
+ plt.subplot(1, 2, 1)
+ librosa.display.specshow(librosa.power_to_db(mel_spec_orig, ref=np.max), sr=sr, x_axis='time', y_axis='mel')
+ plt.title('Original Mel Spectrogram')
+ #plt.colorbar(format='%+2.0f dB')
+ 
+ # Display the pitch-shifted Mel spectrogram (right panel, dB relative to its max)
+ plt.subplot(1, 2, 2)
+ librosa.display.specshow(librosa.power_to_db(mel_spec_pitch_shifted, ref=np.max), sr=sr, x_axis='time', y_axis='mel')
+ plt.title('Pitch-Shifted Mel Spectrogram')
+ #plt.colorbar(format='%+2.0f dB')
+ 
+ plt.tight_layout()
+ plt.show()