analyze {soundgen} | R Documentation |
Acoustic analysis
Description
Acoustic analysis of one or more sounds: pitch tracking, basic spectral
characteristics, formants, estimated loudness (see
getLoudness
), roughness (see modulationSpectrum
),
novelty (see ssm
), etc. The default values of arguments are
optimized for human non-linguistic vocalizations. See
vignette('acoustic_analysis', package = 'soundgen') for details. The defaults
and reasonable ranges of all arguments can be found in
defaults_analyze
. For high-precision work, first extract and
manually correct pitch contours with pitch_app
, PRAAT, or
whatever, and then run analyze(pitchManual = ...)
with these manual
contours.
Usage
analyze(
x,
samplingRate = NULL,
scale = NULL,
from = NULL,
to = NULL,
dynamicRange = 80,
silence = 0.04,
windowLength = 50,
step = windowLength/2,
overlap = 50,
specType = c("spectrum", "reassign", "spectralDerivative")[1],
wn = "gaussian",
zp = 0,
cutFreq = NULL,
nFormants = 3,
formants = list(),
loudness = list(SPL_measured = 70),
roughness = list(windowLength = 15, step = 3, amRes = 10),
novelty = list(input = "melspec", kernelLen = 100),
pitchMethods = c("dom", "autocor"),
pitchManual = NULL,
entropyThres = 0.6,
pitchFloor = 75,
pitchCeiling = 3500,
priorMean = 300,
priorSD = 6,
priorAdapt = TRUE,
nCands = 1,
minVoicedCands = NULL,
pitchDom = list(domThres = 0.1, domSmooth = 220),
pitchAutocor = list(autocorThres = 0.7, autocorSmooth = 7, autocorUpsample = 25,
autocorBestPeak = 0.975, interpol = "sinc", wn = "hanning"),
pitchCep = list(cepThres = 0.75, cepZp = 0),
pitchSpec = list(specThres = 0.05, specPeak = 0.25, specHNRslope = 0.8, specSmooth =
150, specMerge = 0.1, specSinglePeakCert = 0.4, specRatios = 3),
pitchHps = list(hpsNum = 5, hpsThres = 0.1, hpsNorm = 2, hpsPenalty = 2),
pitchZc = list(zcThres = 0.1, zcWin = 5),
harmHeight = list(harmThres = 3, harmTol = 0.25, harmPerSel = 5),
subh = list(method = c("cep", "pitchCands", "harm")[1], nSubh = 5, tol = 0.05, nHarm =
5, harmThres = 12, harmTol = 0.25, amRange = c(10, 200)),
flux = list(thres = 0.15, smoothWin = 100),
amRange = c(10, 200),
fmRange = c(5, 1000/step/2),
shortestSyl = 20,
shortestPause = 60,
interpol = list(win = 75, tol = 0.3, cert = 0.3),
pathfinding = c("none", "fast", "slow")[2],
annealPars = list(maxit = 5000, temp = 1000),
certWeight = 0.5,
snakeStep = 0,
snakePlot = FALSE,
smooth = 1,
smoothVars = c("pitch", "dom"),
summaryFun = c("mean", "median", "sd"),
invalidArgAction = c("adjust", "abort", "ignore")[1],
reportEvery = NULL,
cores = 1,
plot = FALSE,
osc = "linear",
showLegend = TRUE,
savePlots = NULL,
pitchPlot = list(col = rgb(0, 0, 1, 0.75), lwd = 3, showPrior = TRUE),
extraContour = NULL,
ylim = NULL,
xlab = "Time",
ylab = NULL,
main = NULL,
width = 900,
height = 500,
units = "px",
res = NA,
...
)
Arguments
x |
path to a folder, one or more wav or mp3 files c('file1.wav', 'file2.mp3'), Wave object, numeric vector, or a list of Wave objects or numeric vectors |
samplingRate |
sampling rate of |
scale |
maximum possible amplitude of input used for normalization of
input vector (only needed if |
from , to |
if NULL (default), analyzes the whole sound, otherwise from...to (s) |
dynamicRange |
dynamic range, dB. All values more than one dynamicRange under maximum are treated as zero |
silence |
(0 to 1 as proportion of max amplitude) frames with RMS
amplitude below |
windowLength |
length of FFT window, ms |
step |
you can override |
overlap |
overlap between successive FFT frames, % |
specType |
plot the original FFT ('spectrum'), reassigned spectrogram ('reassigned'), or spectral derivative ('spectralDerivative') |
wn |
window type accepted by |
zp |
window length after zero padding, points |
cutFreq |
if specified, spectral descriptives (peakFreq, specCentroid,
specSlope, and quartiles) are calculated only between |
nFormants |
the number of formants to extract per STFT frame (0 = no formant analysis, NULL = as many as possible) |
formants |
a list of arguments passed to
|
loudness |
a list of parameters passed to |
roughness |
a list of parameters passed to
|
novelty |
a list of parameters passed to |
pitchMethods |
methods of pitch estimation to consider for determining pitch contour: 'autocor' = autocorrelation (~PRAAT), 'cep' = cepstral, 'spec' = spectral (~BaNa), 'dom' = lowest dominant frequency band, 'hps' = harmonic product spectrum, NULL = no pitch analysis |
pitchManual |
manually corrected pitch contour. For a single sound,
provide a numeric vector of any length. For multiple sounds, provide a
dataframe with columns "file" and "pitch" (or path to a csv file) as
returned by |
entropyThres |
pitch tracking is only performed for frames with Weiner
entropy below |
pitchFloor , pitchCeiling |
absolute bounds for pitch candidates (Hz) |
priorMean , priorSD |
specifies the mean (Hz) and standard deviation
(semitones) of gamma distribution describing our prior knowledge about the
most likely pitch values for this file. For ex., |
priorAdapt |
adaptive second-pass prior: if TRUE, optimal pitch contours
are estimated first with a prior determined by |
nCands |
maximum number of pitch candidates per method, normally 1...4
(except for |
minVoicedCands |
minimum number of pitch candidates that have to be
defined to consider a frame voiced (if NULL, defaults to 2 if |
pitchDom |
a list of control parameters for pitch tracking using the
lowest dominant frequency band or "dom" method; see details and
|
pitchAutocor |
a list of control parameters for pitch tracking using the
autocorrelation or "autocor" method; see details and
|
pitchCep |
a list of control parameters for pitch tracking using the
cepstrum or "cep" method; see details and |
pitchSpec |
a list of control parameters for pitch tracking using the
BaNa or "spec" method; see details and |
pitchHps |
a list of control parameters for pitch tracking using the
harmonic product spectrum or "hps" method; see details and
|
pitchZc |
a list of control parameters for pitch tracking based on zero
crossings in bandpass-filtered audio or "zc" method; see
|
harmHeight |
a list of control parameters for estimating how high
harmonics reach in the spectrum; see details and |
subh |
a list of control parameters for estimating the strength of
subharmonics per frame - that is, spectral energy at integer ratios of f0:
see |
flux |
a list of control parameters for calculating feature-based flux
(not spectral flux) passed to |
amRange |
target range of frequencies for amplitude modulation, Hz: a
vector of length 2 (affects both |
fmRange |
target range of frequencies for analyzing frequency
modulation, Hz ( |
shortestSyl |
the smallest length of a voiced segment (ms) that constitutes a voiced syllable (shorter segments will be replaced by NA, as if unvoiced) |
shortestPause |
the smallest gap between voiced syllables (ms): large value = interpolate and merge, small value = treat as separate syllables separated by an unvoiced gap |
interpol |
a list of parameters (currently |
pathfinding |
method of finding the optimal path through pitch
candidates: 'none' = best candidate per frame, 'fast' = simple heuristic,
'slow' = annealing. See |
annealPars |
a list of control parameters for postprocessing of
pitch contour with SANN algorithm of |
certWeight |
(0 to 1) in pitch postprocessing, specifies how much we prioritize the certainty of pitch candidates vs. pitch jumps / the internal tension of the resulting pitch curve |
snakeStep |
optimized path through pitch candidates is further
processed to minimize the elastic force acting on pitch contour. To
disable, set |
snakePlot |
if TRUE, plots the snake |
smooth , smoothVars |
if |
summaryFun |
functions used to summarize each acoustic characteristic, eg "c('mean', 'sd')"; user-defined functions are fine (see examples); NAs are omitted automatically for mean/median/sd/min/max/range/sum, otherwise take care of NAs yourself |
invalidArgAction |
what to do if an argument is invalid or outside the
range in |
reportEvery |
when processing multiple inputs, report estimated time left every ... iterations (NULL = default, NA = don't report) |
cores |
number of cores for parallel processing |
plot |
if TRUE, produces a spectrogram with pitch contour overlaid |
osc |
"none" = no oscillogram; "linear" = on the original scale; "dB" = in decibels |
showLegend |
if TRUE, adds a legend with pitch tracking methods |
savePlots |
full path to the folder in which to save the plots (NULL = don't save, ” = same folder as audio) |
pitchPlot |
a list of graphical parameters for displaying the final
pitch contour. Set to |
extraContour |
name of an output variable to overlap on the pitch
contour plot, eg 'peakFreq' or 'loudness'; can also be a list with extra
graphical parameters, eg |
ylim |
frequency range to plot, kHz (defaults to 0 to Nyquist frequency). NB: still in kHz, even if yScale = bark, mel, or ERB |
xlab , ylab , main |
plotting parameters |
width , height , units , res |
parameters passed to
|
... |
other graphical parameters passed to |
Details
Each pitch tracker is controlled by its own list of settings, as follows:
pitchDom
(lowest dominant frequency band)-
domThres
(0 to 1) to find the lowest dominant frequency band, we do short-term FFT and take the lowest frequency with amplitude at least domThresdomSmooth
the width of smoothing interval (Hz) for findingdom
pitchAutocor
(autocorrelation)-
-
autocorThres
voicing threshold (unitless, ~0 to 1) autocorSmooth
the width of smoothing interval (in bins) for finding peaks in the autocorrelation function. Defaults to 7 for sampling rate 44100 and smaller odd numbers for lower values of sampling rate-
autocorUpsample
upsamples acf to this resolution (Hz) to improve accuracy in high frequencies -
autocorBestPeak
amplitude of the lowest best candidate relative to the absolute max of the acf
-
pitchCep
(cepstrum)-
cepThres
voicing threshold (unitless, ~0 to 1) -
cepZp
zero-padding of the spectrum used for cepstral pitch detection (final length of spectrum after zero-padding in points, e.g. 2 ^ 13)
-
-
pitchSpec
(ratio of harmonics - BaNa algorithm) -
-
specThres
voicing threshold (unitless, ~0 to 1) -
specPeak,specHNRslope
when looking for putative harmonics in the spectrum, the threshold for peak detection is calculated asspecPeak * (1 - HNR * specHNRslope)
specSmooth the width of window for detecting peaks in the spectrum, Hz
-
specMerge
pitch candidates withinspecMerge
semitones are merged with boosted certainty -
specSinglePeakCert
(0 to 1) if F0 is calculated based on a single harmonic ratio (as opposed to several ratios converging on the same candidate), its certainty is taken to bespecSinglePeakCert
-
- pitchHps (harmonic product spectrum)
-
hpsNum
the number of times to downsample the spectrum -
hpsThres
voicing threshold (unitless, ~0 to 1) -
hpsNorm
the amount of inflation of hps pitch certainty (0 = none) -
hpsPenalty
the amount of penalizing hps candidates in low frequencies (0 = none)
-
Each of these lists also accepts graphical
parameters that affect how pitch candidates are plotted, eg pitchDom =
list(domThres = .5, col = 'yellow')
. Other arguments that are lists of
subroutine-specific settings include:
harmonicHeight
(finding how high harmonics reach in the spectrum)-
harmThres
minimum height of spectral peak, dB -
harmPerSel
the number of harmonics per sliding selection -
harmTol
maximum tolerated deviation of peak frequency from multiples of f0, proportion of f0
-
Value
Returns a list with $detailed
frame-by-frame descriptives and
a $summary
with one row per file, as determined by summaryFun
(e.g., mean / median / SD of each acoustic variable across all STFT
frames). Output measures include:
- duration
total duration, s
- duration_noSilence
duration from the beginning of the first non-silent STFT frame to the end of the last non-silent STFT frame, s (NB: depends strongly on
windowLength
andsilence
settings)- time
time of the middle of each frame (ms)
- amEnvFreq,amEnvDep
frequency (Hz) and depth (0 to 1) of amplitude modulation estimated from a smoothed amplitude envelope
- amMsFreq,amMsPurity
frequency and purity of amplitude modulation estimated via
modulationSpectrum
- ampl
root mean square of amplitude per frame, calculated as sqrt(mean(frame ^ 2))
- ampl_noSilence
same as
ampl
, but ignoring silent frames- CPP
Cepstral Peak Prominence, dB (a measure of pitch quality, the ratio of the highest peak in the cepstrum to the regression line drawn through it)
- dom
lowest dominant frequency band (Hz) (see "Pitch tracking methods / Dominant frequency" in the vignette)
- entropy
Weiner entropy of the spectrum of the current frame. Close to 0: pure tone or tonal sound with nearly all energy in harmonics; close to 1: white noise
- entropySh
Normalized Shannon entropy of the spectrum of the current frame: 0 = pure tone, 1 = white noise
- f1_freq, f1_width, ...
the frequency and bandwidth of the first nFormants formants per STFT frame, as calculated by phonTools::findformants
- flux
feature-based flux, the rate of change in acoustic features such as pitch, HNR, etc. (0 = none, 1 = max); "epoch" is an audio segment between two peaks of flux that exceed a threshold of
flux = list(thres = ...)
(listed in output$detailed only)- fmFreq
frequency of frequency modulation (FM) such as vibrato or jitter, Hz
- fmDep
depth of FM, semitones
- fmPurity
purity or dominance of the main FM frequency (fmFreq), 0 to 1
- harmEnergy
the amount of energy in upper harmonics, namely the ratio of total spectral mass above 1.25 x F0 to the total spectral mass below 1.25 x F0 (dB)
- harmHeight
how high harmonics reach in the spectrum, based on the best guess at pitch (or the manually provided pitch values)
- HNR
harmonics-to-noise ratio (dB), a measure of harmonicity returned by soundgen:::getPitchAutocor (see "Pitch tracking methods / Autocorrelation"). If HNR = 0 dB, there is as much energy in harmonics as in noise
- loudness
subjective loudness, in sone, corresponding to the chosen SPL_measured - see
getLoudness
- novelty
spectral novelty - a measure of how variable the spectrum is on a particular time scale, as estimated by
ssm
- peakFreq
the frequency with maximum spectral power (Hz)
- pitch
post-processed pitch contour based on all F0 estimates
- quartile25, quartile50, quartile75
the 25th, 50th, and 75th quantiles of the spectrum of voiced frames (Hz)
- roughness
the amount of amplitude modulation, see modulationSpectrum
- specCentroid
the center of gravity of the frame’s spectrum, first spectral moment (Hz)
- specSlope
the slope of linear regression fit to the spectrum below cutFreq (dB/kHz)
- subDep
estimated depth of subharmonics per frame: 0 = none, 1 = as strong as f0. NB: this depends critically on accurate pitch tracking
- subRatio
the ratio of f0 to subharmonics frequency with strength subDep: 2 = period doubling, 3 = f0 / 3, etc.
- voiced
is the current STFT frame voiced? TRUE / FALSE
See Also
pitch_app
getLoudness
segment
getRMS
Examples
sound = soundgen(sylLen = 300, pitch = c(500, 400, 600),
noise = list(time = c(0, 300), value = c(-40, 0)),
temperature = 0.001,
addSilence = 50) # NB: always have some silence before and after!!!
# playme(sound, 16000)
a = analyze(sound, samplingRate = 16000, plot = TRUE)
str(a$detailed) # frame-by-frame
a$summary # summary per sound
## Not run:
# For maximum processing speed (just basic spectral descriptives):
a = analyze(sound, samplingRate = 16000,
plot = FALSE, # no plotting
pitchMethods = NULL, # no pitch tracking
loudness = NULL, # no loudness analysis
novelty = NULL, # no novelty analysis
roughness = NULL, # no roughness analysis
nFormants = 0 # no formant analysis
)
# Take a sound hard to analyze b/c of subharmonics and jitter
sound2 = soundgen(sylLen = 900, pitch = list(
time = c(0, .3, .8, 1), value = c(300, 900, 400, 2300)),
noise = list(time = c(0, 900), value = c(-40, -20)),
subDep = 10, jitterDep = 0.5,
temperature = 0.001, samplingRate = 44100, pitchSamplingRate = 44100)
# playme(sound2, 44100)
a2 = analyze(sound2, samplingRate = 44100, priorSD = 24,
plot = TRUE, ylim = c(0, 5))
# Compare the available pitch trackers
analyze(sound2, 44100,
pitchMethods = c('dom', 'autocor', 'spec', 'cep', 'hps', 'zc'),
# don't use priors to see weird pitch candidates better
priorMean = NA, priorAdapt = FALSE,
plot = TRUE, yScale = 'bark')
# Fancy plotting options:
a = analyze(sound2, samplingRate = 44100, plot = TRUE,
xlab = 'Time, ms', colorTheme = 'seewave',
contrast = .5, ylim = c(0, 4), main = 'My plot',
pitchMethods = c('dom', 'autocor', 'spec', 'hps', 'cep'),
priorMean = NA, # no prior info at all
pitchDom = list(col = 'red', domThres = .25),
pitchPlot = list(col = 'black', pch = 9, lty = 3, lwd = 3),
extraContour = list(x = 'peakFreq', type = 'b', pch = 4, col = 'brown'),
osc = 'dB', heights = c(2, 1))
# Analyze an entire folder in one go, saving spectrograms with pitch contours
# plus an html file for easy access
s2 = analyze('~/Downloads/temp',
savePlots = '', # save in the same folder as audio
showLegend = TRUE, yScale = 'bark',
width = 20, height = 12,
units = 'cm', res = 300, ylim = c(0, 5))
s2$summary[, 1:5]
# Different options for summarizing the output
a = analyze(sound1, 44100,
summaryFun = c('mean', 'range'))
a$summary # one row per sound
# ...with custom summaryFun, eg time of peak relative to duration (0 to 1)
timePeak = function(x) which.max(x) / length(x) # without omitting NAs
timeTrough = function(x) which.min(x) / length(x)
a = analyze(sound2, samplingRate = 16000,
summaryFun = c('mean', 'timePeak', 'timeTrough'))
colnames(a$summary)
# Analyze a selection rather than the whole sound
a = analyze(sound, samplingRate = 16000, from = .1, to = .3, plot = TRUE)
# Use only a range of frequencies when calculating spectral descriptives
# (ignore everything below 100 Hz and above 8000 Hz as irrelevant noise)
a = analyze(sound, samplingRate = 16000, cutFreq = c(100, 8000))
## Amplitude and loudness: analyze() should give the same results as
# dedicated functions getRMS() / getLoudness()
# Create 1 kHz tone
samplingRate = 16000; dur_ms = 50
sound3 = sin(2*pi*1000/samplingRate*(1:(dur_ms/1000*samplingRate)))
a1 = analyze(sound3, samplingRate = samplingRate, scale = 1,
windowLength = 25, overlap = 50,
loudness = list(SPL_measured = 40),
pitchMethods = NULL, plot = FALSE)
a1$detailed$loudness # loudness per STFT frame (1 sone by definition)
getLoudness(sound3, samplingRate = samplingRate, windowLength = 25,
overlap = 50, SPL_measured = 40, scale = 1)$loudness
a1$detailed$ampl # RMS amplitude per STFT frame
getRMS(sound3, samplingRate = samplingRate, windowLength = 25,
overlap = 50, scale = 1)$detailed
# or even simply: sqrt(mean(sound3 ^ 2))
# The same sound as above, but with half the amplitude
a_half = analyze(sound3 / 2, samplingRate = samplingRate, scale = 1,
windowLength = 25, overlap = 50,
loudness = list(SPL_measured = 40),
pitchMethods = NULL, plot = FALSE)
a1$detailed$ampl / a_half$detailed$ampl # rms amplitude halved
a1$detailed$loudness/ a_half$detailed$loudness
# loudness is not a linear function of amplitude
# Analyzing ultrasounds (slow but possible, just adjust pitchCeiling)
s = soundgen(sylLen = 100, addSilence = 10,
pitch = c(25000, 35000, 30000),
formants = NA, rolloff = -12, rolloffKHz = 0,
pitchSamplingRate = 350000, samplingRate = 350000, windowLength = 5,
pitchCeiling = 45000, invalidArgAction = 'ignore')
# s is a bat-like ultrasound inaudible to humans
a = analyze(
s, 350000, plot = TRUE,
pitchFloor = 10000, pitchCeiling = 90000, priorMean = NA,
pitchMethods = c('autocor', 'spec'),
# probably shouldn't use pitchMethod = "dom" b/c of likely low-freq noise
windowLength = 5, step = 2.5,
shortestSyl = 10, shortestPause = 10, # again, very short sounds
interpol = list(win = 10), # again, very short sounds
smooth = 0.1, # might need less smoothing if very rapid f0 changes
nFormants = 0, loudness = NULL, roughness = NULL, novelty = NULL)
# NB: ignore formants and loudness estimates for such non-human sounds
# download 260 sounds from Anikin & Persson (2017)
# http://cogsci.se/publications/anikin-persson_2017_nonlinguistic-vocs/260sounds_wav.zip
# unzip them into a folder, say '~/Downloads/temp'
myfolder = '~/Downloads/temp' # 260 .wav files live here
s = analyze(myfolder) # ~ 10-20 minutes!
# s = write.csv(s, paste0(myfolder, '/temp.csv')) # save a backup
# Check accuracy: import manually verified pitch values (our "key")
# pitchManual # "ground truth" of mean pitch per sound
# pitchContour # "ground truth" of complete pitch contours per sound
files_manual = paste0(names(pitchManual), '.wav')
idx = match(s$file, files_manual) # in case the order is wrong
s$key = pitchManual[idx]
# Compare manually verified mean pitch with the output of analyze:
cor(s$key, s$summary$pitch_median, use = 'pairwise.complete.obs')
plot(s$key, s$summary$pitch_median, log = 'xy')
abline(a=0, b=1, col='red')
# Re-running analyze with manually corrected contours gives correct
pitch-related descriptives like amplVoiced and harmonics (NB: you get it "for
free" when running pitch_app)
s1 = analyze(myfolder, pitchManual = pitchContour)
plot(s$summary$harmonics_median, s1$summary$harmonics_median)
abline(a=0, b=1, col='red')
## End(Not run)