<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="review-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMH</journal-id>
      <journal-id journal-id-type="nlm-ta">JMIR Ment Health</journal-id>
      <journal-title>JMIR Mental Health</journal-title>
      <issn pub-type="epub">2368-7959</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v12i1e74260</article-id>
      <article-id pub-id-type="pmid">41027025</article-id>
      <article-id pub-id-type="doi">10.2196/74260</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Review</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Review</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Speech Emotion Recognition in Mental Health: Systematic Review of Voice-Based Applications</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Torous</surname>
            <given-names>John</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Triantafyllopoulos</surname>
            <given-names>Andreas</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Ogunsakin</surname>
            <given-names>Jamiu</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Ogun</surname>
            <given-names>Sewade</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Br</surname>
            <given-names>Chandrashekar</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Soluoku</surname>
            <given-names>Talha</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Potla</surname>
            <given-names>Ravi Teja</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Jordan</surname>
            <given-names>Eric</given-names>
          </name>
          <degrees>MA</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0007-4145-6433</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Terrisse</surname>
            <given-names>Raphaël</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0004-5864-2679</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Lucarini</surname>
            <given-names>Valeria</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-3553-8818</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Alrahabi</surname>
            <given-names>Motasem</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-5478-4283</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Krebs</surname>
            <given-names>Marie-Odile</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-4715-9890</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Desclés</surname>
            <given-names>Julien</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0007-3984-4760</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Lemey</surname>
            <given-names>Christophe</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <address>
            <institution>URCI Mental Health Department</institution>
            <institution>Brest Medical University Hospital</institution>
            <addr-line>Maison des adolescents, Hôpital Morvan, CHU Brest</addr-line>
            <addr-line>5 avenue Foch</addr-line>
            <addr-line>Brest, 29200</addr-line>
            <country>France</country>
            <phone>33 0229020020</phone>
            <email>christophe.lemey@chu-brest.fr</email>
          </address>
          <xref rid="aff3" ref-type="aff">3</xref>
          <xref rid="aff6" ref-type="aff">6</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-7308-7958</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>ObTIC</institution>
        <institution>Sorbonne Université</institution>
        <addr-line>Paris</addr-line>
        <country>France</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>URCI Mental Health Department</institution>
        <institution>Brest Medical University Hospital</institution>
        <addr-line>Brest</addr-line>
        <country>France</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>EA 7479 SPURBO</institution>
        <institution>Université de Bretagne Occidentale</institution>
        <addr-line>Brest</addr-line>
        <country>France</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Université Paris Cité, Institute of Psychiatry and Neuroscience of Paris (IPNP), INSERM U1266, team “Pathophysiology of Psychiatric disorders: Development and Vulnerability”</institution>
        <addr-line>Paris</addr-line>
        <country>France</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>GHU-Paris Psychiatrie et Neurosciences, Hôpital Sainte Anne, Evaluation, Prevention and Therapeutic Innovation Department, F-75014</institution>
        <addr-line>Paris</addr-line>
        <country>France</country>
      </aff>
      <aff id="aff6">
        <label>6</label>
        <institution>IMT Atlantique, Lab-STICC, UMR CNRS 6285, F-29238</institution>
        <addr-line>Brest</addr-line>
        <country>France</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Christophe Lemey <email>christophe.lemey@chu-brest.fr</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2025</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>30</day>
        <month>9</month>
        <year>2025</year>
      </pub-date>
      <volume>12</volume>
      <elocation-id>e74260</elocation-id>
      <history>
        <date date-type="received">
          <day>24</day>
          <month>3</month>
          <year>2025</year>
        </date>
        <date date-type="rev-request">
          <day>25</day>
          <month>4</month>
          <year>2025</year>
        </date>
        <date date-type="rev-recd">
          <day>20</day>
          <month>6</month>
          <year>2025</year>
        </date>
        <date date-type="accepted">
          <day>22</day>
          <month>6</month>
          <year>2025</year>
        </date>
      </history>
      <copyright-statement>©Eric Jordan, Raphaël Terrisse, Valeria Lucarini, Motasem Alrahabi, Marie-Odile Krebs, Julien Desclés, Christophe Lemey. Originally published in JMIR Mental Health (https://mental.jmir.org), 30.09.2025.</copyright-statement>
      <copyright-year>2025</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Mental Health, is properly cited. The complete bibliographic information, a link to the original publication on https://mental.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://mental.jmir.org/2025/1/e74260" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>The field of speech emotion recognition (SER) encompasses a wide variety of approaches, with artificial intelligence technologies providing improvements in recent years. In the domain of mental health, the links between individuals’ emotional states and pathological diagnoses are of particular interest.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aimed to investigate the performance of tools combining SER and artificial intelligence approaches with a view to their use within clinical contexts and to determine the extent to which SER technologies have already been applied within clinical contexts.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>The review includes studies applied to speech (audio) signals for a select set of pathologies or disorders and only includes those studies that evaluate diagnostic performance using machine learning performance metrics or statistical correlation measures. The PubMed, IEEE Xplore, arXiv, and ScienceDirect databases were queried as recently as February 2025. The Quality Assessment of Diagnostic Accuracy Studies tool was used to measure the risk of bias.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>A total of 14 articles were included in the final review. The included papers addressed suicide risk (3/14, 21%), depression (8/14, 57%), and psychotic disorders (3/14, 21%).</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>SER technologies are mostly used indirectly in mental health research and in a wide variety of ways, including different architectures, datasets, and pathologies. This diversity makes a direct assessment of the technology challenging. Nonetheless, promising results are obtained in various studies that attempt to diagnose patients based on either indirect or direct results from SER models. These results highlight the potential for this technology to be used within a clinical setting. Future work should focus on how clinicians can use these technologies collaboratively.</p>
        </sec>
        <sec sec-type="trial registration">
          <title>Trial Registration</title>
          <p>PROSPERO CRD420251006669; https://www.crd.york.ac.uk/PROSPERO/view/CRD420251006669</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>affective computing</kwd>
        <kwd>machine learning</kwd>
        <kwd>mental health</kwd>
        <kwd>psychology</kwd>
        <kwd>psychiatry</kwd>
        <kwd>speech emotion recognition</kwd>
        <kwd>voice</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>Emotions play a pivotal role in human interaction and communication, influencing various aspects of social exchange, decision-making, and overall well-being. While emotions can be expressed through multiple modalities, including facial expressions, body language, and gestures, human speech remains one of the most prominent and accessible channels for conveying emotional states.</p>
        <p>Intonation, rhythm, pitch, and other acoustic features of speech convey subtle emotional cues, reflecting an individual’s psychological well-being [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. In recent years, there has been increasing interest in leveraging advancements in machine learning (ML) to analyze and interpret emotional cues from speech, a field known as speech emotion recognition (SER). Interest in the automated detection of mental disorders through vocal features is growing, particularly in the context of mental health assessment and monitoring. The ability to automatically analyze and interpret emotional cues from speech offers several advantages for improving patient care, enabling early detection of mental health issues, and enhancing the overall health care experience [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref6">6</xref>].</p>
        <p>To better understand the foundations of SER and its evolving landscape, it is important to consider its historical development, theoretical models, and methodological challenges.</p>
      </sec>
      <sec>
        <title>History of SER</title>
        <p>The field of SER originated in the 1990s, when it emerged as a subsection within the field of speech processing. Initial work approached the task by extracting acoustic features from recordings and then performing statistical analysis using various algorithms to derive correlations between the extracted features and the emotional state of the speaker [<xref ref-type="bibr" rid="ref7">7</xref>]. The original papers published on the topic suggested that the automatic detection of emotions could be applied in the context of human-machine interaction.</p>
        <p>These initial publications led to further interest within the field and prompted questions such as the modeling of emotions and the contrast between acted and nonacted emotions. The choice of how to represent emotions is a nontrivial issue and has historically required coordination with the field of psychology to ensure the model used is both compatible with the machines being used for automatic recognition and consistent with the psychological literature on emotions [<xref ref-type="bibr" rid="ref8">8</xref>].</p>
        <p>The models typically used fall into 2 groups. First, <italic>categorical models</italic> represent emotions as separate “classes” (eg, happy, sad, and angry). The “big six” model proposed by Ekman et al [<xref ref-type="bibr" rid="ref9">9</xref>] is a well-known example, encompassing happiness, sadness, anger, fear, disgust, and surprise. A more nuanced extension of this categorical approach is the model developed by Plutchik [<xref ref-type="bibr" rid="ref10">10</xref>], which expands the core emotions by incorporating trust and anticipation, forming an 8–primary-emotion framework. The model also introduces the concept of emotional intensity and relationships between emotions, visualized in the “wheel of emotions” (<xref rid="figure1" ref-type="fig">Figure 1</xref>), where primary emotions blend to form more complex emotional states. Subsequent research has often adapted emotional classifications by adding new categories, removing some, or merging similar emotions into a single class.</p>
        <p>Second, <italic>dimensional</italic> or <italic>continuous</italic> models interpret emotions as degrees of some underlying features. These models typically involve plotting emotions on 2 or 3 axes, the most common of which is the 2-dimensional model of arousal and valence [<xref ref-type="bibr" rid="ref11">11</xref>]. In this paradigm, valence reflects the positive or negative quality of emotion. Happiness has high valence, while sadness has low valence. Arousal reflects the level of physiological activation or intensity associated with emotion. For example, surprise and anger typically have high arousal, while sadness and contentment often have low arousal (<xref rid="figure2" ref-type="fig">Figure 2</xref>).</p>
        <p>Much of the existing work in SER predominantly relies on the 6-scale model developed by Ekman [<xref ref-type="bibr" rid="ref9">9</xref>] and the 8-scale model developed by Plutchik [<xref ref-type="bibr" rid="ref10">10</xref>], as they offer well-defined categories that facilitate annotation and classification [<xref ref-type="bibr" rid="ref12">12</xref>]. In contrast, dimensional approaches, which conceptualize emotions along continuous axes, remain comparatively underexplored. However, the dimensional perspective provides a more nuanced framework for capturing emotional complexity and extends beyond speech processing to psychopathology.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>The wheel of emotions proposed by Plutchik.</p>
          </caption>
          <graphic xlink:href="mental_v12i1e74260_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>The circumplex model of emotion with its 2 axes: valence and arousal.</p>
          </caption>
          <graphic xlink:href="mental_v12i1e74260_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>From Initial Works to Challenges</title>
        <p>By the late 2000s, the field of SER had garnered more widespread interest in speech processing, with the development of a wide variety of feature extraction methods and ML algorithms. However, these advancements gave rise to several problems, most notably a lack of comparability among results.</p>
        <p>Facing these problems in the late 2000s, researchers in the field initiated a series of SER challenges, including the INTERSPEECH 2009 Emotion Challenge [<xref ref-type="bibr" rid="ref13">13</xref>]. These competitions aimed to standardize key aspects of the SER process, such as datasets, feature extraction methods, and algorithms, to facilitate the generation of comparable results within a unified context. One significant outcome of these efforts was the development of the open-source Speech and Music Interpretation by Large-Space Extraction (openSMILE) tool for feature extraction [<xref ref-type="bibr" rid="ref14">14</xref>].</p>
      </sec>
      <sec>
        <title>Arrival of Neural Networks to SER</title>
        <p>With the introduction of convolutional neural networks in automatic speech recognition in 2012 [<xref ref-type="bibr" rid="ref15">15</xref>] through their application to spectrograms, these algorithms were quickly adopted for use in SER contexts [<xref ref-type="bibr" rid="ref16">16</xref>].</p>
        <p>Following trends in both natural language processing (NLP) and automatic speech recognition, various neural network (NN) architectures were also applied. These methods ranged from training NN models directly on SER-related data to training on large quantities of unrelated speech data and then fine-tuning on SER datasets.</p>
      </sec>
      <sec>
        <title>Convergence Between SER and Mental Health</title>
        <p>Just as emotions can be viewed from a categorical or dimensional perspective, these approaches can also be applied to mental health disorders. Traditional diagnostic classifications, for example, the <italic>Diagnostic and Statistical Manual of Mental Disorders</italic> (<italic>DSM</italic>) and the <italic>International Classification of Diseases</italic>, adopt a categorical view of mental disorders, which has long served as the standard reference. However, these models come with some limitations when applied to clinical issues. The shift toward viewing disorders as existing along a continuum rather than within rigid diagnostic categories is exemplified by the research domain criteria (RDoC) classification in psychiatry, developed by the National Institute of Mental Health. The RDoC marks a significant evolution in the approach to mental disorders by focusing on biological, cognitive, and behavioral criteria rather than just clinical symptoms. Thus, the RDoC classification proposes a dimensional approach that aims to investigate the underlying processes of mental disorders by addressing transnosographic domains, such as emotion, cognition, or motivation [<xref ref-type="bibr" rid="ref17">17</xref>]. This shift from categorical classification to a dimensional approach leads to the study of psychiatric disorders as spectrums of dysfunction rather than as distinct entities.</p>
      </sec>
      <sec>
        <title>Operational Definition of Direct and Indirect SER</title>
        <p>For the purposes of this review, we distinguish between direct and indirect SER approaches as applied in mental health research. This distinction refers to whether emotion is explicitly modeled and analyzed as a task in its own right or implicitly captured through emotionally relevant features. Direct SER refers to approaches in which emotion recognition is an explicit step in the analysis pipeline. This typically involves training or fine-tuning models on emotion-labeled datasets or applying pretrained SER systems to detect emotional states in speech recordings. The detected emotions are then analyzed in relation to mental health conditions. Indirect SER, by contrast, involves the use of features, models, or techniques that capture emotional characteristics of speech without explicitly recognizing or classifying emotions. For example, acoustic features used in SER may be extracted and used in models for mental health classification, even though emotion labels are not used at any point. These features carry emotional information, but emotion itself is not the target variable.</p>
        <p>Given the increasing interest in applying SER within clinical psychiatry, where vocal emotional cues offer a noninvasive and objective window into patients’ mental states, this study examines the specific role of emotion recognition in identifying psychiatric disorders. To provide context, we review the advancements, applications, and challenges of SER in health care, highlighting its potential in mental health monitoring, suicide prevention, and diagnosis of mood and psychotic disorders. This review emphasizes the advantages of SER, including its noninvasive nature, objective assessment capabilities, and potential for automated analysis, while addressing key challenges, such as the need for diverse datasets, interpretability of ML models, and generalization across populations.</p>
        <p>This paper is structured as follows: in the first section, we introduce the history and advancements in SER. Subsequently, we review the available SER datasets, followed by an overview of existing applications of speech- and emotion-related technologies in clinical settings. Finally, we discuss current challenges, limitations, and prospects for collaboration between the fields of SER and mental health.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Preferred Reporting Items for Systematic Reviews and Meta-Analyses Review</title>
        <p>Recognizing and analyzing emotions is an essential tool for the clinician. The fields of psychiatry and psychology have long recognized that learning and mastering this skill is at the very core of the diagnostic and therapeutic process, not only for health care professionals but also for patients. However, this clinical skill is neither infallible nor sufficient for systematic precision diagnosis. Several studies have focused on the automated recognition of emotions using a variety of computational techniques. What can these new techniques offer to clinicians in the analysis and interpretation of emotions?</p>
        <p>To answer these questions, we conducted a systematic review of literature following the PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses) guidelines, registering our review in the PROSPERO system (CRD420251006669). This review aims to cover studies adopting an acoustic- or speech-related analysis of mental health questions, with a particular focus on works that explicitly addressed the emotional dimension of speech.</p>
      </sec>
      <sec>
        <title>Inclusion and Exclusion Criteria</title>
        <p>The inclusion criteria for this review are described in <xref ref-type="boxed-text" rid="box1">Textbox 1</xref>.</p>
        <boxed-text id="box1" position="float">
          <title>Study inclusion and exclusion criteria.</title>
          <p>
            <bold>Inclusion criteria</bold>
          </p>
          <list list-type="bullet">
            <list-item>
              <p>Studies containing an analysis of speech (ie, audio) signal</p>
            </list-item>
            <list-item>
              <p>Studies containing either a direct or indirect emotion recognition component. Indirect components include analyses that could be interpreted from an emotional perspective, for example, the use of the open-source Speech and Music Interpretation by Large-Space Extraction (openSMILE) feature sets widely used in speech emotion recognition tasks</p>
            </list-item>
            <list-item>
              <p>Speech data was derived from a clinical context</p>
            </list-item>
          </list>
          <p>
            <bold>Exclusion criteria</bold>
          </p>
          <list list-type="bullet">
            <list-item>
              <p>Audio was only analyzed in conjunction with another modality (eg, text).</p>
            </list-item>
            <list-item>
              <p>No diagnosis or prognosis aspect was included, for example, analyzing emotions in isolation without a correlation to patient conditions or outcomes.</p>
            </list-item>
            <list-item>
              <p>The pathology examined could be classified as a neurological disorder rather than a mental health disorder, for example, Alzheimer disease.</p>
            </list-item>
            <list-item>
              <p>The study was a review with no experimental component.</p>
            </list-item>
          </list>
        </boxed-text>
      </sec>
      <sec>
        <title>Search Strategy and Screening Process</title>
        <p>Database queries were performed using the PubMed, IEEE Xplore, arXiv, and ScienceDirect databases up until February 2025, with the following keyword search: (“emotion recognition” OR “affective computing” OR “emotional analysis”) AND (“psychiatry” OR “psychology”) AND (“speech” OR “voice”).</p>
        <p>During the screening process, 2 authors applied the eligibility criteria and selected the studies to be included in the systematic review. In case of doubt, it was established that the article would be submitted to the rest of the review group, and no such cases were encountered.</p>
        <p>An initial screening based on the title and abstract of the search results was performed, removing any articles that did not meet the inclusion criteria.</p>
        <p>From this initial screening, the remaining articles were assessed to determine whether their design included a direct evaluation of the models’ performance in a diagnostic task, measured by either ML metrics (<italic>F</italic><sub>1</sub>-score, accuracy, and area under the curve [AUC]) or statistical correlation measures. Most articles excluded in this step did not include an audio component, did not evaluate the clinical diagnostic performance of their model, or applied their model to patients whose pathology was outside the scope of this review (sometimes analyzing only healthy control participants).</p>
        <p>After filtering based on these criteria, a total of 14 studies were retained. The Quality Assessment of Diagnostic Accuracy Studies–2 framework was used to estimate the risk of bias in these selected studies.</p>
        <p>The Results section is organized in subsections to enhance the reader’s understanding, particularly regarding the methodological context of SER. First, we provide an overview of the methods used within SER and several existing databases. We then present the results of the systematic review.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>PRISMA Review Results</title>
        <p>As outlined in the flowchart in <xref rid="figure3" ref-type="fig">Figure 3</xref>, a total of 3648 studies were screened, with 85 (2.33%) reports retrieved and assessed. From these 85 studies, 14 (16%) were included in the final review, with the most common reasons for exclusion being the lack of speech analysis alone (ie, models using only text or a combination of audio and other modalities) or the absence of a diagnostic perspective within the study (ie, no prediction of pathological severity or comparison between patients and controls). Of the 14 studies included in the final selection, 3 (21%) addressed suicide risk and suicidal ideation (SI), 8 (57%) analyzed depression and mood disorders, and 3 (21%) studied psychotic disorders.</p>
        <p>Several studies screened did not analyze the emotional state of the patients but examined their capacity to detect emotional expressions in others, that is, to determine whether their pathologies impacted their perception of emotional expressions.</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Flow diagram of the study selection process based on the PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses) guidelines.</p>
          </caption>
          <graphic xlink:href="mental_v12i1e74260_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Risk of Bias Assessment</title>
        <p>Most of the included studies had a low risk of bias across all fields. As shown in <xref rid="figure4" ref-type="fig">Figure 4</xref>, the main exception was in patient selection, where 5 studies were deemed to have a high risk of bias. Among these 5 studies, 3 selected participants only from within clinical populations (ie, no control group), and the remaining 2 did not select from a representative sample [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref18">18</xref>-<xref ref-type="bibr" rid="ref21">21</xref>]. In addition, high concerns regarding the applicability of patient selection were noted in 2 cases [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref19">19</xref>]. A total of 4 studies raised unclear concerns regarding patient selection [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. The full Quality Assessment of Diagnostic Accuracy Studies–2 results are available in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
        <p>This section is divided into 2 parts. First, we present a narrative review of established methodologies within the field of SER. Second, the results of the PRISMA review are presented.</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>The Quality Assessment of Diagnostic Accuracy Studies 2 (QUADAS-2) results, highlighting risk of bias and applicability concerns. Patient selection was associated with a high risk of bias owing to sampling methods and sample size.</p>
          </caption>
          <graphic xlink:href="mental_v12i1e74260_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Narrative Review</title>
        <p>Here, we present a brief overview of the methods and technical architectures used within the field of SER, covering the transition from traditional ML methods to NNs and state-of-the-art transformer-based methods [<xref ref-type="bibr" rid="ref22">22</xref>].</p>
      </sec>
      <sec>
        <title>Established Methodologies</title>
        <p>Speech, serving as a primary mode of communication, harbors a wealth of emotional cues that can be harnessed to assess an individual’s mental and emotional well-being. Various methodologies have been devised to extract, analyze, and interpret these cues, each providing distinct perspectives on the underlying emotional states.</p>
        <sec>
          <title>Acoustic or Prosodic Feature Extraction</title>
          <p>Acoustic or prosodic feature extraction involves the analysis of various acoustic properties of speech signals, such as pitch, intensity, duration, and spectral characteristics [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>]. Acoustic features are related to the physical properties of sound waves and can be observed at the level of milliseconds, for example, frequency, which is related to how humans perceive pitch. Prosodic features are typically observed over a longer timescale, with pitch contours, for example, measuring how pitch changes over time, and other measures, including speech rate and pauses. Both sets of features serve as the basis for quantifying emotional cues present in speech and are commonly used as input for ML models. Techniques such as mel-frequency cepstral coefficients (MFCCs), pitch contour analysis, and formant analysis are commonly used in acoustic feature extraction for emotion detection tasks [<xref ref-type="bibr" rid="ref25">25</xref>].</p>
          <p>These measures correlate with various aspects of speech production, which can be impacted by the emotional state of the speaker. MFCCs, for example, capture the spectral characteristics of speech (ie, how energy is distributed across different frequencies), which can be linked to certain emotions (eg, higher energy in mid to high frequencies for anger or joy). Pitch contour analysis tracks variation in the fundamental frequency of speech, that is, the perceived pitch, over time. Variation in a speaker’s pitch allows listeners to distinguish between sadness and excitement. Finally, formant values are influenced by the shape and tension of the vocal tract during speech. Physiological aspects of emotions (eg, smiling) may impact these values.</p>
        </sec>
        <sec>
          <title>The OpenSMILE Toolkit</title>
          <p>Among the most prevalent tools used for acoustic feature extraction is the <italic>openSMILE</italic> toolkit [<xref ref-type="bibr" rid="ref14">14</xref>]. This toolkit is available in both a Windows executable (.exe) file and a Python-accessible package and contains several widely used collections of acoustic and prosodic measures (referred to as feature sets). These feature sets encompass a variety of acoustic characteristics, such as frequency, loudness, MFCCs, and other speech attributes. These features can then be analyzed using the methods presented in subsequent sections to detect emotions or, in the context of computational paralinguistics, to aid in the recognition of mental health conditions. Among its strongest advantages are the ease of use for feature extraction and the standardization of feature sets, allowing for more straightforward comparison between results.</p>
        </sec>
        <sec>
          <title>Traditional ML Approaches (Before NNs)</title>
          <p>Traditional ML approaches involve the use of statistical and pattern recognition algorithms to classify emotional states based on extracted features. These approaches typically include algorithms such as support vector machines (SVM), k-nearest neighbor (KNN), and decision trees [<xref ref-type="bibr" rid="ref26">26</xref>-<xref ref-type="bibr" rid="ref30">30</xref>]. By training these models on labeled datasets of emotional speech samples, they can learn to classify new instances into predefined emotional categories.</p>
        </sec>
        <sec>
          <title>Recurrent NNs: Long Short-Term Memory and Bidirectional Long Short-Term Memory</title>
          <p>Recurrent NNs, particularly Long Short-Term Memory (LSTM) and bidirectional LSTM architectures, gained popularity in speech emotion detection tasks because of their ability to capture temporal dependencies in sequential data [<xref ref-type="bibr" rid="ref31">31</xref>-<xref ref-type="bibr" rid="ref34">34</xref>]. These NNs excel at processing time-series data, such as speech signals, allowing them to capture long-range dependencies and subtle temporal patterns in emotional expression.</p>
        </sec>
        <sec>
          <title>Transformers</title>
          <p>Transformers, originally developed for NLP tasks, have emerged as the state-of-the-art approach for analyzing speech signals for emotion detection [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref35">35</xref>-<xref ref-type="bibr" rid="ref39">39</xref>]. These architectures, including models such as the Bidirectional Encoder Representations from Transformers, excel at capturing contextual information and semantic relationships within sequential data. By fine-tuning pretrained transformer models on emotion detection tasks, researchers can leverage their powerful language understanding capabilities to analyze emotional content in speech. One study involved the use of a transformer, specifically, a self-attention-based deep learning (DL) model combining a 2-dimensional convolutional neural network and an LSTM network. This model focuses on optimizing feature extraction from speech using MFCCs and achieved an impressive average test accuracy of 90% [<xref ref-type="bibr" rid="ref40">40</xref>].</p>
        </sec>
      </sec>
      <sec>
        <title>Available Datasets</title>
        <p>A multitude of datasets for the SER task exist, each falling into one of several categories based on certain characteristics (eg, the type of data collected, the classification of emotions, or the relation to other tasks). However, few to no datasets are available that serve both SER and mental health applications. This section provides an overview of some of the most widely used datasets and their characteristics.</p>
        <sec>
          <title>Distress Analysis Interview Corpus–Wizard of Oz</title>
          <p>The Distress Analysis Interview Corpus–Wizard of Oz (DAIC-WOZ) dataset is part of the larger Distress Analysis Interview Corpus [<xref ref-type="bibr" rid="ref41">41</xref>]. It includes audio and transcript recordings of semistructured clinical interviews between participants and an interviewer. The interviews were designed to detect signs of depression, anxiety, and posttraumatic stress disorder. Using scores from several questionnaires related to psychological distress and current mood, the audio and transcript data can be used for a classification of the interviewees.</p>
          <p>While the dataset does not explicitly include annotations of the emotional states of the interviewees, labels could be derived using another SER model to directly analyze correlations between the participants’ emotional and mental states. A total of 193 interviews are available, with each interview lasting between 5 and 20 minutes, and all the interviews were conducted in North American English.</p>
        </sec>
        <sec>
          <title>Ryerson Audio-Visual Database of Emotional Speech and Song</title>
          <p>Ryerson Audio-Visual Database of Emotional Speech and Song is a multimodal database (including both audio and face recordings), containing both acted speech and song [<xref ref-type="bibr" rid="ref42">42</xref>]. The recordings, in North American English, feature 24 professional actors (12 male and 12 female actors).</p>
          <p>Each expression (ie, emotion) is produced at 2 levels of emotional intensity, as well as at a neutral level. The dataset comprises a total of 7356 recordings (4320 speech and 3036 singing).</p>
        </sec>
        <sec>
          <title>FAU-AIBO Dataset</title>
          <p>The FAU-AIBO dataset consists of spontaneous speech from German children’s interaction with a human-controlled robot, which they were told was autonomous [<xref ref-type="bibr" rid="ref43">43</xref>].</p>
          <p>The data is annotated on the word level for the presence of 11 emotional states, as evaluated by 5 judges.</p>
          <p>This dataset was used in the initial 2009 INTERSPEECH SER Challenge [<xref ref-type="bibr" rid="ref13">13</xref>]. Recent work has shown that this dataset presents a significant challenge in achieving strong classification performance, even for state-of-the-art methods [<xref ref-type="bibr" rid="ref16">16</xref>].</p>
        </sec>
        <sec>
          <title>Interactive Emotional Dyadic Motion Capture Dataset</title>
          <p>The Interactive Emotional Dyadic Motion Capture dataset is a multimodal resource, including both audio and motion capture data, allowing analysis of speech, facial expressions, and gestures [<xref ref-type="bibr" rid="ref44">44</xref>]. The dataset comprises scripted speech as well as improvised scenarios, all produced by actors who were specifically instructed to produce or elicit certain emotions.</p>
          <p>Both categorical and dimensional labeling approaches are used. The dataset contains approximately 12 hours of data.</p>
        </sec>
        <sec>
          <title>Canadian French Emotional Dataset</title>
          <p>The Canadian French Emotional dataset is a Canadian French emotional speech dataset [<xref ref-type="bibr" rid="ref45">45</xref>]. It includes recordings from 6 male and 6 female actors reading 6 different sentences in the 6 basic emotional states described by Ekman and in a neutral state, containing approximately 69 minutes of recordings in total. While this dataset is smaller than many other available datasets, it highlights how few resources exist for languages other than English.</p>
        </sec>
      </sec>
      <sec>
        <title>PRISMA Selected Studies</title>
        <p>The selected studies are presented in the subsequent sections, categorized based on mental health pathologies. Table S1 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> provides a full overview of the included studies. This overview summarizes key characteristics such as methodology (eg, NNs, transformer models, or acoustic feature–based ML). Several trends can be observed, including the predominance of English and Mandarin datasets and the frequent use of LSTM and transformer models in recent years. The table also shows the potential risk of bias owing to population selection, as highlighted in the QUADAS assessment earlier. The details of each included study are presented in the subsequent sections.</p>
        <sec>
          <title>Suicide Risk and SI</title>
          <p>Gerczuk et al [<xref ref-type="bibr" rid="ref19">19</xref>] explored differences in speech based on sex in the context of suicide risk, using both interpretable (ie, acoustic) and deep features. The authors reported the best results using an emotion fine-tuned wav2vec 2.0 model, achieving 81% balanced accuracy (ie, the model had an 81% chance of making a correct prediction for a given example, accounting for differences in sample sizes between groups) in high- versus low-risk suicide classification. Notably, this result was achieved by training the model separately for each sex. The authors reported a difference in the relationship between acoustic measures and suicide risk across sexes—agitation in male individuals was associated with increased suicide risk, whereas the opposite was true in female individuals—explaining the advantage of training the model separately for each group.</p>
          <p>Another study analyzed interviews from patients who were recently discharged from the hospital after experiencing SI or other circumstances (ie, patients who had attempted suicide, those receiving psychiatric care, and healthy control participants) [<xref ref-type="bibr" rid="ref20">20</xref>]. Two separate experiments were conducted. First, SER classifiers were trained and evaluated using acoustic features based on self-reported Positive and Negative Affect Schedule emotion labels from the interviews. Second, a comparison of the SI group and the other groups was performed based on the variability of the reported emotions.</p>
          <p>The authors reported a maximum AUC of 0.78 when classifying the different emotion labels, and an AUC of 0.79 was achieved using the variability between these emotional states to distinguish between participants with SI and the other groups. The authors noted that the SI group showed lower emotional variability compared with the other groups, indicating that emotional states can be a good indicator of discrimination between control and pathological groups. These scores demonstrate a good level of discrimination between the groups, with AUC indicating the capacity of a model to differentiate between a randomly selected positive case and a randomly selected negative case (an AUC &#62;0.7 indicates the model is achieving a fair discrimination rate, whereas an AUC of 0.5 is equivalent to random chance).</p>
          <p>Suicidal ideas among US veterans were investigated by Belouali et al [<xref ref-type="bibr" rid="ref3">3</xref>]. The study extracted a wide range of features (acoustic, prosodic, and linguistic) from recordings of veterans and trained several models (eg, random forest, logistic regression, and deep NNs). Feature selection was applied to identify the most relevant features for each model. The best results in classifying veterans with SI from their nonsuicidal counterparts were obtained using a combination of acoustic and linguistic features, achieving a sensitivity of 0.86, specificity of 0.70, and AUC of 0.80. The voices of individuals with SI differed from those of their nonsuicidal counterparts regarding energy (lower SD of energy contours in voiced segments, lower kurtosis, and lower skewness), indicating a flatter and less animated voice. Individuals with SI were also found to have more monotonous voices.</p>
          <p>Overall, studies addressing suicide risk and SI demonstrated good discrimination between the patients with SI and control groups (AUC approximately 0.8 and accuracy approximately 80%), indicating that these methods can prove useful in a clinical context to identify patients at high risk of committing suicide.</p>
        </sec>
        <sec>
          <title>Depression and Mood Disorders</title>
          <p>Using both the emotional labels as well as the Patient Health Questionnaire-8 (PHQ-8) depression scores for each participant, Wang et al [<xref ref-type="bibr" rid="ref6">6</xref>] attempted to find a link between speech features and depression scores as well as depression status (ie, participants with depression vs control). Several models were used: from traditional ML approaches (SVM and random forest) to transformer-based approaches (combining several complex models). The best performance was achieved by this complex transformer-based model, reaching an accuracy of 77% (ie, a 77% chance of assigning a given participant to the correct group) and an <italic>F</italic><sub>1</sub>-score of 0.63, indicating the capability of complex models to outperform more traditional ML approaches on the same dataset.</p>
          <p>Yang et al [<xref ref-type="bibr" rid="ref46">46</xref>] highlighted how confusion between patients with low-mood bipolar depression and those with unipolar depression can lead to patients not receiving appropriate treatment. The patients were shown videos to elicit various emotions (happy, sad, disgust, fear, surprise, and anger) and asked to respond aloud to several questions. The recordings were labeled (with emotion profiles, ie, probabilities of each emotion for a given recording) using an SVM classifier trained on the eNTERFACE dataset. Different classifiers were then trained to distinguish between bipolar, unipolar, and healthy control groups based on these emotional profiles. The model (a combination of both LSTM and bidirectional LSTM architectures) achieved a classification accuracy of 77% when distinguishing among the 3 groups.</p>
          <p>Correlations were found between vocal prosody measures and change in the measures of the Hamilton Rating Scale for Depression over the course of 21 weeks, as outlined in Yang et al [<xref ref-type="bibr" rid="ref47">47</xref>]. The participants were diagnosed with major depressive disorder according to the <italic>DSM-IV</italic> guidelines, and the measures of switching pause (ie, the time between utterances from the patient and the interviewer) and fundamental frequency were taken from interviews conducted throughout the duration of the study. When analyzing within-subject variation in depression scores, the authors found that as depression severity decreased, pause duration became shorter and less variable, accounting for 32% of the overall variation over time in the patient’s depression scores. Furthermore, the depression scores were estimated using hierarchical linear modeling, with linear discriminant classifiers reaching an accuracy of 69.5% when determining depression severity.</p>
          <p>Stepanov et al [<xref ref-type="bibr" rid="ref48">48</xref>] addressed the question of determining depression severity in the 2017 Audio-Visual Emotion Challenge using the DAIC-WOZ dataset. This challenge involved predicting PHQ-8 scores based on the patients’ speech. The best performance was achieved using low-level acoustic features extracted through openSMILE to train an LSTM model.</p>
          <p>Extracting prosodic features (glottal flow, voice quality, and spectral features) from the DAIC-WOZ dataset, Mao et al [<xref ref-type="bibr" rid="ref49">49</xref>] trained a range of DL models, with a hybrid model achieving an impressive accuracy of 98.7% and an <italic>F</italic><sub>1</sub>-score of 0.987, indicating that the model could almost perfectly distinguish between the control and depression groups.</p>
          <p>By applying the transformer architecture to frequency-related parameters from both the DAIC-WOZ dataset and their own proprietary data, Yang et al [<xref ref-type="bibr" rid="ref50">50</xref>] achieved maximum <italic>F</italic><sub>1</sub>-scores of 0.78 and 0.87, respectively. By analyzing the frequency components most important to their model’s predictions, the authors found that the frequency range from 600-700 Hz was the most important, corresponding to the Mandarin vowel /e/ or /ê/. The authors presented this as a potential biomarker for depression.</p>
          <p>Studies applied to depression and mood disorders were the most prevalent among the results of this study (8 of the 14 selected articles). A range of different results were achieved, with newer methods showing strong improvement compared with older works (an accuracy of 69% in 2013 using only 2 prosodic features [<xref ref-type="bibr" rid="ref47">47</xref>] vs 98% in 2023 using a range of prosodic features and DL methods [<xref ref-type="bibr" rid="ref49">49</xref>]). These findings suggest that automated methods can be useful to clinicians for diagnostic purposes (notably for determining severity) and may help guide clinicians in adapting their therapeutic approaches.</p>
        </sec>
        <sec>
          <title>Psychotic Disorders</title>
          <p>Chakraborty et al [<xref ref-type="bibr" rid="ref51">51</xref>] used the <italic>emobase</italic> feature set of low-level descriptors from openSMILE to discriminate between a group of patients with schizophrenia and a group of healthy controls, based on an interview conducted with a psychologist. In addition, a negative symptom assessment 16 (NSA-16) score, ranging from 1 to 6, was assigned by the psychologist to indicate the severity of negative symptoms of schizophrenia, that is, mainly motivational and emotional impairments. Prediction of these scores was also evaluated. A range of classifiers was trained on the data after conducting feature selection. The best-performing classifier for patient versus control classification was a linear SVM classifier using principal component analysis feature selection, reaching 79.49% accuracy (compared with 66.67% for a classifier predicting the majority class). For NSA-16 prediction, classifiers were trained for each of the different items, with accuracy ranging from 62% to 85%. The best-performing classifiers were SVMs, KNN, and decision trees, with a range of different feature selection techniques.</p>
          <p>Using a proprietary dataset of patients with schizophrenia with and without formal thought disorder (FTD), first-degree relatives, and neurotypical controls (15 of each; 60 total), Çokal et al [<xref ref-type="bibr" rid="ref18">18</xref>] elicited spontaneous speech using an image description task. Pauses were measured in the participants’ speech and classified based on the duration of pause, the presence or absence of a filler word (eg, <italic>umm</italic> and <italic>ehh</italic>), and the syntactic context in which the pause occurred.</p>
          <p>Patients without FTD produced significantly more unfilled pauses (ie, pauses without a filler word) than the controls in both utterance-initial contexts and before embedded clauses, and had more pauses before embedded clauses compared with patients with FTD. When compared with the control and first-degree relative groups, patients with FTD produced longer utterance-initial pauses.</p>
          <p>The extended Geneva Minimalistic Acoustic Parameter Set (eGeMAPS) [<xref ref-type="bibr" rid="ref52">52</xref>], available through openSMILE, was used by de Boer et al [<xref ref-type="bibr" rid="ref53">53</xref>] along with a random forest classifier to distinguish between patients with schizophrenia spectrum disorders and healthy controls. The model achieved a classification accuracy of 86% between healthy controls and patients and an accuracy of 74% among patients between those with negative symptoms and those with positive symptoms. They also indicated that their work was a positive step toward validating language features as biomarkers in psychiatry.</p>
          <p>Once again, the results from these studies applied in the context of psychosis performed well when distinguishing between groups, as well as among subgroups of patients with schizophrenia. In a clinical context, these methods can be of notable use for patient screening and early detection, areas in which clinicians often struggle to clearly orient their patients. It is worth noting that none of the works included in the reviewed sample involved a direct analysis of emotions in the context of psychotic disorders.</p>
        </sec>
      </sec>
      <sec>
        <title>Overview of Biomarkers in Mental Health Prediction</title>
        <p>Several of the studies included in this review propose that the acoustic measures used could be considered as speech-derived biomarkers for mental health prediction. These biomarkers span the prosodic, spectral, and temporal domains and are often extracted using standardized tools, such as the openSMILE toolkit.</p>
        <sec>
          <title>Prosodic and Temporal Markers</title>
          <p>Markers, such as pitch (F0), energy, pause patterns, and speech rate, were commonly explored. For example, shorter and less variable within-subject pauses were associated with increasing depression severity [<xref ref-type="bibr" rid="ref47">47</xref>]. Çokal et al [<xref ref-type="bibr" rid="ref18">18</xref>] observed longer utterance-initial pauses in patients with FTD and more unfilled pauses in patients with schizophrenia without FTD. Belouali et al [<xref ref-type="bibr" rid="ref3">3</xref>] reported lower energy variability and flatter energy contours in suicidal speech, suggesting a dull and monotonous prosody.</p>
        </sec>
        <sec>
          <title>Spectral Features</title>
          <p>Many studies relied on MFCCs, spectral slope, formants, and related features. Gerczuk et al [<xref ref-type="bibr" rid="ref19">19</xref>] highlighted spectral slope (0-500 Hz), alpha ratio, and F1 bandwidth as predictive of suicide risk, with nuanced gender differences. Both F2 and spectral flux were found to be negatively associated with depression (indicating lower levels of energy and motivation), while a higher MFCC 4 was positively associated with depression [<xref ref-type="bibr" rid="ref21">21</xref>]. Yang et al [<xref ref-type="bibr" rid="ref50">50</xref>] emphasized specific frequency bands corresponding to Mandarin vowel formants that were positively associated with depression.</p>
        </sec>
        <sec>
          <title>Feature Rankings and Model-Driven Importance</title>
          <p>Some studies [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref53">53</xref>] analyzed feature relevance using importance rankings from models trained on the eGeMAPS and the ComParE feature set to identify features, such as voiced segments per second, spectral flux, and pitch percentiles, as top contributors for schizophrenia and depression, respectively. Stepanov et al [<xref ref-type="bibr" rid="ref48">48</xref>] found spectral features to be more predictive than prosodic or voice quality features for PHQ score prediction.</p>
          <p>While a few studies [<xref ref-type="bibr" rid="ref46">46</xref>,<xref ref-type="bibr" rid="ref51">51</xref>,<xref ref-type="bibr" rid="ref54">54</xref>] applied acoustic features without further interpretation, this diversity of biomarker use demonstrates the strength of acoustic feature–based models as interpretable alternatives to end-to-end models.</p>
        </sec>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>Within the fields of psychology and psychiatry, the emotional states of patients can be linked to their condition. This is the case both in a pathological sense, with patients with depression being characterized as having low moods associated with sadness, or in a symptomatic sense, where the emotional state of a patient with depression can vary depending on their current circumstances. While several works exist at the intersection of these fields and the emerging field of SER, the diversity in methodologies, datasets, and results makes it difficult to determine what the practical application of these results in a clinical context could look like.</p>
        <p>Nonetheless, the results presented in the selected studies are promising, with many of the models discussed earlier differentiating between pathological and healthy control groups at a significant rate (accuracy approximately 70%-80% and AUC approximately 0.8). The application of these technologies across a wide variety of domains is also promising, indicating that these technologies could be used for early screening (eg, in the diagnosis of psychotic disorders, where clinicians may struggle to guide patients early on).</p>
        <p>The use of the <italic>openSMILE</italic> toolkit and its various feature sets (eg, eGeMAPS) throughout several of the selected studies highlights the possibility of integrating SER approaches within a clinical context. Furthermore, the results achieved using these feature sets are promising, especially given their ease of use and interpretation.</p>
        <p>Recent research on the application of SER techniques in psychiatry appears promising, particularly in highlighting the potential clinical utility of such tools. As evidenced by several studies included in this review, SER can be used in between-subjects designs to differentiate patients from healthy controls or to distinguish between individuals with different psychiatric diagnoses, thereby providing support for the diagnostic process.</p>
        <p>Beyond diagnostic discrimination, the potential of SER extends to within-subject applications, particularly in the context of longitudinal monitoring. For instance, SER systems could be used to assess the daily emotional states of individuals with various psychiatric conditions—such as major depressive disorder or borderline personality disorder—for example, integrated into ecological momentary assessment frameworks.</p>
        <p>Moreover, SER technologies may hold predictive value with respect to clinical course, offering support in the monitoring of critical symptoms, such as SI, depressive, manic, or psychotic symptoms. This could help with the early identification and treatment of conditions that may pose significant risks to patients.</p>
        <p>Finally, SER tools may serve as a complementary measure in the assessment of treatment efficacy, providing objective data on emotional expression that can enrich traditional clinical evaluations.</p>
        <p>To the best of our knowledge, this study is the first systematic review to provide a synthesized overview of SER in psychiatry. This work could help standardize future studies and improve the reproducibility and comparability of results.</p>
      </sec>
      <sec>
        <title>Challenges and Limitations</title>
        <p>While SER holds promise for health care applications, several challenges and limitations must be addressed to realize its full potential. One such challenge is the exploration of multimodal emotion recognition, which involves integrating multiple sources of emotional cues, including speech, body language, and facial expressions. While SER primarily focuses on analyzing speech signals, incorporating additional modalities could enhance the accuracy and robustness of emotion recognition systems, particularly in complex health care settings.</p>
        <p>In addition, the interpretability of ML models and the ethical considerations surrounding the use of sensitive health data present significant challenges in the development and deployment of SER systems in health care settings. Ensuring transparency and accountability in the decision-making process of these algorithms is essential for building trust and confidence among health care professionals and patients. As outlined earlier, some complex models (ie, models whose added complexity makes clinical interpretation of predictions increasingly difficult) can achieve results superior to those of traditional ML methods applied to acoustic or prosodic features, which are more easily interpreted [<xref ref-type="bibr" rid="ref6">6</xref>]. If these methods are to be applied within a clinical context, a trade-off must be found between classification performance and interpretability.</p>
        <p>Furthermore, the generalization of SER algorithms across diverse populations and cultural contexts remains a significant hurdle. The major datasets used within the field of SER (eg, Interactive Emotional Dyadic Motion Capture) are in English, and as shown in Table S1 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>, most of the studies included in this review examined either English-speaking participants or those who spoke Mandarin or other Chinese languages. In all cases, these datasets are composed of participants from a single cultural context, which may limit their applicability to the expression of emotion across different cultures. Variations in speech patterns, dialects, and cultural norms can impact the performance of SER systems, highlighting the need for robust validation and adaptation strategies to ensure the reliability and effectiveness of these systems across different demographics [<xref ref-type="bibr" rid="ref24">24</xref>].</p>
        <p>In addition to these limitations, translating NLP results into routine clinical practice in psychiatry—implying rapid, replicable, and scalable analysis—presents specific challenges.</p>
        <p>First, possible issues with the inaccuracy of vocal recording or transcription can hinder detailed analyses when integrating NLP tools into existing clinical workflows. Although linguistic analyses are generally considered fairly robust regarding the quality of the transcripts [<xref ref-type="bibr" rid="ref55">55</xref>], conducting fine-grained analysis, such as SER, may require higher-quality data, which is not always available in a clinical context.</p>
        <p>Addressing these challenges requires interdisciplinary collaboration among researchers, clinicians, and technologists to develop innovative solutions that prioritize patient privacy, data security, and ethical considerations while harnessing the potential of SER to enhance mental health assessment and treatment in health care settings.</p>
      </sec>
      <sec>
        <title>Future Directions</title>
        <p>Separately, the tasks of SER and mental health screening have been addressed using methods that span statistical analysis of acoustic measures, ML, and DL approaches. These techniques have been applied to a range of pathologies, conditions, and disorders within the field of mental health. In this paper, we have specifically addressed those works that exclusively use the audio from the speaker’s speech as the input to their models. Some works have proposed that acoustic measures of patients’ speech be developed as biomarkers of pathology, as it is the clinician’s perception of these measures that allows a diagnosis [<xref ref-type="bibr" rid="ref53">53</xref>]. This highlights these measures’ obvious advantage compared with the use of transformers and other DL models when collaborating in a clinical context: their ease of interpretation.</p>
        <p>Among the various applications, similarities can be observed, notably in the reuse of certain methods, especially the use of acoustic measures, both in mental health and SER, which can be linked back to the emotional state of the patient. Given that the same methods are used both for the recognition of pathologies and the recognition of participants’ emotional states, the question arises: Could the direct automated analysis of patients’ emotional states be useful for investigating the aforementioned pathologies?</p>
        <sec>
          <title>Proposed Pipelines for Audio Processing</title>
          <p>Here, two possible approaches are proposed for the detection of pathologies from patients’ speech. From our review, most studies adopt the first approach described in the subsequent section, which involves mapping directly from a patient’s speech to a possible pathology.</p>
          <sec>
            <title>Speech to Pathology</title>
            <p>This pipeline is represented in the vast majority of studies in the literature, in which feature extraction or ML and DL methods are applied directly to the signal from recordings of the participants.</p>
          </sec>
          <sec>
            <title>Speech to Emotion and Emotion to Pathology</title>
            <p>In this approach, speech from a patient is first analyzed using a SER system (based on one of the various methodologies outlined earlier). The emotional states recognized by this system are then analyzed to gain insight into the patient’s pathological status. Although some studies have adopted a similar approach [<xref ref-type="bibr" rid="ref20">20</xref>], this approach for the detection of mental health conditions has not been widely explored.</p>
            <p>The pipeline from speech to emotion, then emotion to pathology, presents a notable advantage in a clinical context, because it offers a clear interpretation of why a given classification was made. Furthermore, this approach can be used collaboratively with health care professionals to facilitate and complement their work.</p>
            <p>One possible challenge associated with this approach is the introduction of error by the SER system. To mitigate this risk, the selection of the SER system is of utmost importance, ensuring that its design (eg, training data) allows it to perform effectively on a given set of data.</p>
          </sec>
          <sec>
            <title>Shared Challenges</title>
            <p>From our review, the variety of methods observed (eg, a wide range of audio processing methods, datasets of different sizes and compositions, and the use of various feature sets) led to difficulties in directly comparing different studies, even those applied to the same pathologies. This difficulty in comparison is similar to that encountered during the early days of the field of SER.</p>
            <p>Further emphasis on the organization of shared challenges could promote the sharing of methods, data, and results, thus improving the comparability of work in the field, as was the case in SER. Several challenges have been conducted within this field (or adjacent ones), with the most recent including the Audio-Visual Emotion Challenge 2019 for depression [<xref ref-type="bibr" rid="ref56">56</xref>] and the Alzheimer dementia recognition through spontaneous speech challenge for Alzheimer disease [<xref ref-type="bibr" rid="ref57">57</xref>].</p>
            <p>One hurdle to overcome in the organization of these challenges, especially in a context relating to health care, is the sharing of confidential data.</p>
          </sec>
        </sec>
        <sec>
          <title>Dimensional Approach to Pathology</title>
          <p>Dimensional approaches to pathology in mental health can offer a more refined perspective on disorders by focusing on symptom severity and underlying mechanisms rather than rigid diagnostic categories.</p>
          <p>To be consistent with the papers we reported, we grouped mental health conditions in this review according to disease categories. In contrast to this categorical approach, a dimensional approach could provide different insights [<xref ref-type="bibr" rid="ref58">58</xref>]. In practice, this paradigm shift is conducive to the identification of (linguistic) biomarkers [<xref ref-type="bibr" rid="ref59">59</xref>]. This conceptual change in understanding psychiatric disorders aims to facilitate collaborative work by proposing a common framework through which specialists from different fields can study pathological mechanisms.</p>
          <p>Thus, the RDoC classification in psychiatry, described previously, proposes a dimensional understanding of mental disorders, facilitating the understanding of individual variability and accounting for the different clinical presentations among patients with the same pathology from a categorical perspective.</p>
          <p>In the same way, the Hierarchical Taxonomy of Psychopathology (HiTOP), proposed by Kotov et al [<xref ref-type="bibr" rid="ref60">60</xref>], is a promising model, especially for SER. Rather than replacing traditional categorical classifications, HiTOP hierarchically organizes symptoms into spectra and subfactors. It conceptualizes mental disorders along broad dimensions that reflect underlying emotional and behavioral patterns.</p>
          <p>Internalized disorders are characterized by self-directed emotional and psychological symptoms, such as anxiety, depression, or phobias. Individuals affected by these disorders tend to internalize their struggles, often isolating their emotions. Externalized disorders, on the other hand, manifest in outwardly directed disruptive behaviors, such as aggression, impulsivity, or transgression of social norms. Thought disorders (eg, schizophrenia) are characterized by confusion, detachment from reality, and unusual experiences. By framing psychopathology as a continuum, HiTOP moves beyond discrete diagnostic categories to capture the overlapping nature of emotional and behavioral dysfunctions.</p>
          <p>At the same time, the integration of NLP and emotion recognition into this approach is opening new perspectives. NLP enables the analysis of language data by detecting linguistic cues associated with mental disorders, such as depression or anxiety. These analyses facilitate the extraction of psychological and emotional dimensions, enriching the RDoC criteria with behavioral and cognitive data. The use of linguistic data in the RDoC approach could thus improve diagnostic processes, offering a more detailed and dynamic view of psychiatric disorders, and contribute to more personalized treatments adapted to the specific profiles of each patient. This convergence of neuroscientific and technological advances could truly transform the way psychiatry approaches mental disorders.</p>
          <p>It should be stressed that proposing a clear and systematic mapping from emotions to dimensional approaches to pathologies remains a complex task. Emotional models in SER are based on theoretical presuppositions derived from psychology or cognitive science, while psychiatric classification systems rely on distinct clinical and diagnostic logics, with sometimes divergent levels of granularity and objectives. For example, the circumplex model offers a fine-grained, continuous representation of emotions along the valence and arousal axes, but does not directly align with the diagnostic criteria of the <italic>DSM</italic> or the functional domains of the RDoC.</p>
        </sec>
        <sec>
          <title>Multimodal Approaches</title>
          <p>This study was limited to works that involved the analysis of audio exclusively; however, some of the included studies showed that a multimodal approach could outperform approaches based solely on the speech signal (eg, Mao et al [<xref ref-type="bibr" rid="ref49">49</xref>]). Many architectures have been proposed; one such example is the contrastive language-audio pretraining architecture [<xref ref-type="bibr" rid="ref61">61</xref>]. This model is based on the transformer architecture and is trained on audio and text pairs using 2 encoders. In its introduction, this model was shown to outperform state-of-the-art models in several speech-related tasks without the need to train the model on data related to those tasks (a technique referred to as zero-shot learning).</p>
          <p>While these approaches can be more effective than audio-only methods, it is worth noting that they can introduce more invasiveness to procedures, for example, through the recording of video.</p>
        </sec>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>SER is emerging as a promising tool in mental health care, offering potential for early detection, continuous monitoring, and personalized interventions. This approach is based on ML and artificial intelligence technologies and involves the recognition of emotions from recordings of patients in a health care context, followed by the analysis of these emotions, with possible applications of other ML methods to either the emotions from the model or the use of the output of the SER model as input into a classifier to distinguish between populations (eg, control or pathological groups, or other classes fitting the dataset at hand).</p>
        <p>This is the approach we suggest in the framework of the Apprentissage Profond pour l’Analyse Informatisée de la Subjectivité et des Emotions dans les troubles psychotiques émergents (DL for digital analysis of subjectivity and emotions in emerging psychotic disorders) project. This project, a collaboration between Sorbonne University, the French National Institute of Health, and the Brest University Hospital Centre, aims to develop automated methods for predicting psychotic transition among patients and, in doing so, improve the prognosis for patients. With this program, we aim to better understand the convergences and discontinuities between dimensional approaches, such as the RDoC, and models of emotion, to contribute to the development of speech analysis tools that are more aligned with clinical needs.</p>
        <p>SER presents several advantages in this context, as an objective, noninvasive technique that potentially offers real-time insights into patients’ emotional states, with a promising role in diagnosis support and clinical evolution monitoring.</p>
        <p>However, to be successful, interdisciplinary collaboration among computer scientists, psychologists, linguists, and health care professionals will be essential to refine these technologies and ensure their precise, nonbiased, and ethical implementation. Further work should explore the within-subject application of SER, particularly in the context of longitudinal monitoring. This approach would allow the regular assessment of patients’ emotional states and thus early identification and treatment of conditions, such as SI, depression, or psychotic episodes.</p>
        <p>Beyond that, the next challenge for SER applications in mental health will likely be to integrate SER with other clinical data, as well as biological, genetic, and imaging data, in large-scale multimodal analyses to better characterize and predict psychiatric disorders.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>The full Quality Assessment of Diagnostic Accuracy Studies 2 results.</p>
        <media xlink:href="mental_v12i1e74260_app1.xlsx" xlink:title="XLSX File  (Microsoft Excel File), 20 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>PRISMA selected studies overview table.</p>
        <media xlink:href="mental_v12i1e74260_app2.pdf" xlink:title="PDF File  (Adobe PDF File), 896 KB"/>
      </supplementary-material>
      <supplementary-material id="app3">
        <label>Multimedia Appendix 3</label>
        <p>PRISMA 2020 checklist.</p>
        <media xlink:href="mental_v12i1e74260_app3.pdf" xlink:title="PDF File  (Adobe PDF File), 669 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AUC</term>
          <def>
            <p>area under the curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">DAIC-WOZ</term>
          <def>
            <p>Distress Analysis Interview Corpus–Wizard of Oz</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">DL</term>
          <def>
            <p>deep learning</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">DSM</term>
          <def>
            <p>Diagnostic and Statistical Manual of Mental Disorders</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">eGeMAPs</term>
          <def>
            <p>extended Geneva Minimalistic Acoustic Parameter Set</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">FTD</term>
          <def>
            <p>formal thought disorder</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">HiTOP</term>
          <def>
            <p>Hierarchical Taxonomy of Psychopathology</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">LSTM</term>
          <def>
            <p>Long Short-Term Memory</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">MFCC</term>
          <def>
            <p>Mel-frequency cepstral coefficient</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">ML</term>
          <def>
            <p>machine learning</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">NN</term>
          <def>
            <p>neural network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">openSMILE</term>
          <def>
            <p>open-source Speech and Music Interpretation by Large-Space Extraction</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb14">PHQ-8</term>
          <def>
            <p>Patient Health Questionnaire-8</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb15">PRISMA</term>
          <def>
            <p>Preferred Reporting Items for Systematic Reviews and Meta-Analyses</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb16">RDoC</term>
          <def>
            <p>Research Domain Criteria</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb17">SER</term>
          <def>
            <p>speech emotion recognition</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb18">SI</term>
          <def>
            <p>suicidal ideation</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb19">SVM</term>
          <def>
            <p>support vector machine</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The authors would like to express their heartfelt gratitude to the Fondation de France and the Fondation de l’Avenir for their invaluable support. This work was also supported by the “Investissements d’Avenir” program of the French government, which is managed by the Agence Nationale de la Recherche, under the reference PsyCARE ANR-18-RHUS-0014.</p>
      <p>The authors also thank Jean-Marie Tshimula for his contribution to part of the initial version of this document during his contract for the project.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Latif</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Qadir</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Qayyum</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Usama</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Younis</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Speech technology for healthcare: opportunities, challenges, and state of the art</article-title>
          <source>IEEE Rev Biomed Eng</source>
          <year>2021</year>
          <volume>14</volume>
          <fpage>342</fpage>
          <lpage>56</lpage>
          <pub-id pub-id-type="doi">10.1109/rbme.2020.3006860</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ozseven</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Arpacioglu</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Comparative performance analysis of metaheuristic feature selection methods for speech emotion recognition</article-title>
          <source>Meas Sci Rev</source>
          <year>2024</year>
          <volume>24</volume>
          <issue>2</issue>
          <fpage>72</fpage>
          <lpage>82</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://sciendo.com/article/10.2478/msr-2024-0010"/>
          </comment>
          <pub-id pub-id-type="doi">10.2478/msr-2024-0010</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Belouali</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Gupta</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sourirajan</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Allen</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Alaoui</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Dutton</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Reinhard</surname>
              <given-names>MJ</given-names>
            </name>
          </person-group>
          <article-title>Acoustic and language analysis of speech for suicidal ideation among US veterans</article-title>
          <source>BioData Min</source>
          <year>2021</year>
          <month>02</month>
          <day>02</day>
          <volume>14</volume>
          <issue>1</issue>
          <fpage>11</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://biodatamining.biomedcentral.com/articles/10.1186/s13040-021-00245-y"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s13040-021-00245-y</pub-id>
          <pub-id pub-id-type="medline">33531048</pub-id>
          <pub-id pub-id-type="pii">10.1186/s13040-021-00245-y</pub-id>
          <pub-id pub-id-type="pmcid">PMC7856815</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>KY</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>CH</given-names>
            </name>
            <name name-style="western">
              <surname>Su</surname>
              <given-names>MH</given-names>
            </name>
            <name name-style="western">
              <surname>Chou</surname>
              <given-names>CH</given-names>
            </name>
          </person-group>
          <article-title>Mood disorder identification using deep bottleneck features of elicited speech</article-title>
          <source>Proceedings of the Asia-Pacific Signal and Information Processing Association Annual Summit and Conference</source>
          <year>2017</year>
          <conf-name>APSIPA ASC 2017</conf-name>
          <conf-date>December 12-15, 2017</conf-date>
          <conf-loc>Kuala Lumpur, Malaysia</conf-loc>
          <pub-id pub-id-type="doi">10.1109/apsipa.2017.8282296</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Milling</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Baird</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bartl-Pokorny</surname>
              <given-names>KD</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Alcorn</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tavassoli</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Ainger</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Pellicano</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Pantic</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Cummins</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Schuller</surname>
              <given-names>BW</given-names>
            </name>
          </person-group>
          <article-title>Evaluating the impact of voice activity detection on speech emotion recognition for autistic children</article-title>
          <source>Front Comput Sci</source>
          <year>2022</year>
          <volume>4</volume>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.frontiersin.org/journals/computer-science/articles/10.3389/fcomp.2022.837269/full"/>
          </comment>
          <pub-id pub-id-type="doi">10.3389/fcomp.2022.837269</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhen</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Tu</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Depression speech recognition with a three-dimensional convolutional network</article-title>
          <source>Front Hum Neurosci</source>
          <year>2021</year>
          <month>9</month>
          <day>30</day>
          <volume>15</volume>
          <fpage>713823</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/34658815"/>
          </comment>
          <pub-id pub-id-type="doi">10.3389/fnhum.2021.713823</pub-id>
          <pub-id pub-id-type="medline">34658815</pub-id>
          <pub-id pub-id-type="pmcid">PMC8514878</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dellaert</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Polzin</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Waibel</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Recognizing emotion in speech</article-title>
          <source>Proceeding of Fourth International Conference on Spoken Language Processing</source>
          <year>1996</year>
          <conf-name>ICSLP '96</conf-name>
          <conf-date>October 3-6, 1996</conf-date>
          <conf-loc>Philadelphia, PA</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ieeexplore.ieee.org/document/608022"/>
          </comment>
          <pub-id pub-id-type="doi">10.21437/icslp.1996-462</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Schuller</surname>
              <given-names>BW</given-names>
            </name>
          </person-group>
          <article-title>Speech emotion recognition: two decades in a nutshell, benchmarks, and ongoing trends</article-title>
          <source>Commun ACM</source>
          <year>2018</year>
          <month>04</month>
          <day>24</day>
          <volume>61</volume>
          <issue>5</issue>
          <fpage>90</fpage>
          <lpage>9</lpage>
          <pub-id pub-id-type="doi">10.1145/3129340</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ekman</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Sorenson</surname>
              <given-names>ER</given-names>
            </name>
            <name name-style="western">
              <surname>Friesen</surname>
              <given-names>WV</given-names>
            </name>
          </person-group>
          <article-title>Pan-cultural elements in facial displays of emotion</article-title>
          <source>Science</source>
          <year>1969</year>
          <month>04</month>
          <day>04</day>
          <volume>164</volume>
          <issue>3875</issue>
          <fpage>86</fpage>
          <lpage>8</lpage>
          <pub-id pub-id-type="doi">10.1126/science.164.3875.86</pub-id>
          <pub-id pub-id-type="medline">5773719</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Plutchik</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <source>Emotion: A Psychoevolutionary Synthesis</source>
          <year>1980</year>
          <publisher-loc>New York, NY</publisher-loc>
          <publisher-name>Harper &#38; Row</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Russell</surname>
              <given-names>JA</given-names>
            </name>
          </person-group>
          <article-title>A circumplex model of affect</article-title>
          <source>J Pers Soc Psychol</source>
          <year>1980</year>
          <month>12</month>
          <volume>39</volume>
          <issue>6</issue>
          <fpage>1161</fpage>
          <lpage>78</lpage>
          <pub-id pub-id-type="doi">10.1037/h0077714</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Plaza-del-Arco</surname>
              <given-names>FM</given-names>
            </name>
            <name name-style="western">
              <surname>Cercas Curry</surname>
              <given-names>AA</given-names>
            </name>
            <name name-style="western">
              <surname>Cercas Curry</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Hovy</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Emotion analysis in NLP: trends, gaps and roadmap for future directions</article-title>
          <source>Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation</source>
          <year>2024</year>
          <conf-name>LREC-COLING 2024</conf-name>
          <conf-date>May 20-25, 2024</conf-date>
          <conf-loc>Torino, Italia</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/2024.lrec-main.506"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Schuller</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Steidl</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Batliner</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>The INTERSPEECH 2009 emotion challenge</article-title>
          <source>Proceedings of the 10th Annual Conference of the International Speech Communication Association</source>
          <year>2009</year>
          <conf-name>INTERSPEECH 2009</conf-name>
          <conf-date>September 9-10, 2009</conf-date>
          <conf-loc>Brighton, UK</conf-loc>
          <pub-id pub-id-type="doi">10.21437/interspeech.2009-103</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Eyben</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Wöllmer</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Schuller</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Opensmile: the Munich versatile and fast open-source audio feature extractor</article-title>
          <source>Proceedings of the 18th ACM International Conference on Multimedia</source>
          <year>2010</year>
          <conf-name>MM '10</conf-name>
          <conf-date>October 25-29, 2010</conf-date>
          <conf-loc>Firenze, Italy</conf-loc>
          <pub-id pub-id-type="doi">10.1145/1873951.1874246</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Abdel-Hamid</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Mohamed</surname>
              <given-names>AR</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Penn</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Applying Convolutional Neural Networks concepts to hybrid NN-HMM model for speech recognition</article-title>
          <source>Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing</source>
          <year>2012</year>
          <conf-name>ICASSP 2012</conf-name>
          <conf-date>March 25-30, 2012</conf-date>
          <conf-loc>Kyoto, Japan</conf-loc>
          <pub-id pub-id-type="doi">10.1109/icassp.2012.6288864</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Triantafyllopoulos</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Batliner</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Rampp</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Milling</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Schuller</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>INTERSPEECH 2009 emotion challenge revisited: benchmarking 15 years of progress in speech emotion recognition</article-title>
          <source>ArXiv. Preprint posted online on June 10, 2024</source>
          <year>2025</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2406.06401"/>
          </comment>
          <pub-id pub-id-type="doi">10.21437/interspeech.2024-97</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cuthbert</surname>
              <given-names>BN</given-names>
            </name>
            <name name-style="western">
              <surname>Insel</surname>
              <given-names>TR</given-names>
            </name>
          </person-group>
          <article-title>Toward the future of psychiatric diagnosis: the seven pillars of RDoC</article-title>
          <source>BMC Med</source>
          <year>2013</year>
          <month>05</month>
          <day>14</day>
          <volume>11</volume>
          <issue>1</issue>
          <fpage>126</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedicine.biomedcentral.com/articles/10.1186/1741-7015-11-126"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/1741-7015-11-126</pub-id>
          <pub-id pub-id-type="medline">23672542</pub-id>
          <pub-id pub-id-type="pii">1741-7015-11-126</pub-id>
          <pub-id pub-id-type="pmcid">PMC3653747</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Çokal</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Zimmerer</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Turkington</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Ferrier</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Varley</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Watson</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hinzen</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Disturbing the rhythm of thought: speech pausing patterns in schizophrenia, with and without formal thought disorder</article-title>
          <source>PLoS One</source>
          <year>2019</year>
          <month>05</month>
          <day>31</day>
          <volume>14</volume>
          <issue>5</issue>
          <fpage>e0217404</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0217404"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0217404</pub-id>
          <pub-id pub-id-type="medline">31150442</pub-id>
          <pub-id pub-id-type="pii">PONE-D-18-34437</pub-id>
          <pub-id pub-id-type="pmcid">PMC6544238</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gerczuk</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Amiriparian</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Lutz</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Strube</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Papazova</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Hasan</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Schuller</surname>
              <given-names>BW</given-names>
            </name>
          </person-group>
          <article-title>Exploring gender-specific speech patterns in automatic suicide risk assessment</article-title>
          <source>ArXiv. Preprint posted online on June 26, 2024</source>
          <year>2025</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2407.11012"/>
          </comment>
          <pub-id pub-id-type="doi">10.21437/interspeech.2024-1097</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gideon</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Schatten</surname>
              <given-names>HT</given-names>
            </name>
            <name name-style="western">
              <surname>McInnis</surname>
              <given-names>MG</given-names>
            </name>
            <name name-style="western">
              <surname>Provost</surname>
              <given-names>EM</given-names>
            </name>
          </person-group>
          <article-title>Emotion recognition from natural phone conversations in individuals with and without recent suicidal ideation</article-title>
          <source>Proceedings of the INTERSPEECH 2019</source>
          <year>2019</year>
          <conf-name>INTERSPEECH 2019</conf-name>
          <conf-date>September 15-19, 2019</conf-date>
          <conf-loc>Graz, Austria</conf-loc>
          <pub-id pub-id-type="doi">10.21437/interspeech.2019-1830</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Han</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Yao</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Xue</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Developing a machine learning model for detecting depression, anxiety, and apathy in older adults with mild cognitive impairment using speech and facial expressions: a cross-sectional observational study</article-title>
          <source>Int J Nurs Stud</source>
          <year>2023</year>
          <month>10</month>
          <volume>146</volume>
          <fpage>104562</fpage>
          <pub-id pub-id-type="doi">10.1016/j.ijnurstu.2023.104562</pub-id>
          <pub-id pub-id-type="medline">37531702</pub-id>
          <pub-id pub-id-type="pii">S0020-7489(23)00127-X</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Khare</surname>
              <given-names>SK</given-names>
            </name>
            <name name-style="western">
              <surname>Blanes-Vidal</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Nadimi</surname>
              <given-names>ES</given-names>
            </name>
            <name name-style="western">
              <surname>Acharya</surname>
              <given-names>UR</given-names>
            </name>
          </person-group>
          <article-title>Emotion recognition and artificial intelligence: a systematic review (2014–2023) and research recommendations</article-title>
          <source>Inf Fusion</source>
          <year>2024</year>
          <month>02</month>
          <volume>102</volume>
          <fpage>102019</fpage>
          <pub-id pub-id-type="doi">10.1016/j.inffus.2023.102019</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Cohen</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Speech emotion recognition using auditory spectrogram and cepstral features</article-title>
          <source>Proceedings of the 29th European Signal Processing Conference</source>
          <year>2021</year>
          <conf-name>EUSIPCO 2021</conf-name>
          <conf-date>August 23-27, 2021</conf-date>
          <conf-loc>Dublin, Ireland</conf-loc>
          <pub-id pub-id-type="doi">10.23919/eusipco54536.2021.9616144</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>de Lope</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Graña</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>An ongoing review of speech emotion recognition</article-title>
          <source>Neurocomputing</source>
          <year>2023</year>
          <month>04</month>
          <volume>528</volume>
          <fpage>1</fpage>
          <lpage>11</lpage>
          <pub-id pub-id-type="doi">10.1016/j.neucom.2023.01.002</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>GK</given-names>
            </name>
          </person-group>
          <article-title>Evaluating gammatone frequency cepstral coefficients with neural networks for emotion recognition from speech</article-title>
          <source>ArXiv. Preprint posted online on June 23, 2018</source>
          <year>2025</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1806.09010"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Al Dujaili</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Ebrahimi-Moghadam</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Fatlawi</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Speech emotion recognition based on SVM and KNN classifications fusion</article-title>
          <source>Int J Electr Comput Eng</source>
          <year>2021</year>
          <month>04</month>
          <day>01</day>
          <volume>11</volume>
          <issue>2</issue>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ijece.iaescore.com/index.php/IJECE/article/view/21086"/>
          </comment>
          <pub-id pub-id-type="doi">10.11591/ijece.v11i2.pp1259-1264</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Akinpelu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Viriri</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Speech emotion classification using attention based network and regularized feature selection</article-title>
          <source>Sci Rep</source>
          <year>2023</year>
          <month>07</month>
          <day>25</day>
          <volume>13</volume>
          <issue>1</issue>
          <fpage>11990</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-023-38868-2"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-023-38868-2</pub-id>
          <pub-id pub-id-type="medline">37491423</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-023-38868-2</pub-id>
          <pub-id pub-id-type="pmcid">PMC10368662</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ismaiel</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Alhalangy</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mohamed</surname>
              <given-names>AO</given-names>
            </name>
            <name name-style="western">
              <surname>Musa</surname>
              <given-names>AI</given-names>
            </name>
          </person-group>
          <article-title>Deep learning, ensemble and supervised machine learning for Arabic speech emotion recognition</article-title>
          <source>Eng Technol Appl Sci Res</source>
          <year>2024</year>
          <month>04</month>
          <day>02</day>
          <volume>14</volume>
          <issue>2</issue>
          <fpage>13757</fpage>
          <lpage>64</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.etasr.com/index.php/ETASR/article/view/7134"/>
          </comment>
          <pub-id pub-id-type="doi">10.48084/etasr.7134</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Al Dujaili</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Ebrahimi-Moghadam</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Automatic speech emotion recognition based on hybrid features with ANN, LDA and K_NN classifiers</article-title>
          <source>Multimed Tools Appl</source>
          <year>2023</year>
          <month>04</month>
          <day>22</day>
          <volume>82</volume>
          <issue>27</issue>
          <fpage>42783</fpage>
          <lpage>801</lpage>
          <pub-id pub-id-type="doi">10.1007/s11042-023-15413-x</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nath</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Shahi</surname>
              <given-names>AK</given-names>
            </name>
            <name name-style="western">
              <surname>Martin</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Choudhury</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Mandal</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Speech emotion recognition using machine learning: a comparative analysis</article-title>
          <source>SN Comput Sci</source>
          <year>2024</year>
          <month>04</month>
          <day>04</day>
          <volume>5</volume>
          <fpage>390</fpage>
          <pub-id pub-id-type="doi">10.1007/s42979-024-02656-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Abdelhamid</surname>
              <given-names>AA</given-names>
            </name>
            <name name-style="western">
              <surname>El-Kenawy</surname>
              <given-names>EM</given-names>
            </name>
            <name name-style="western">
              <surname>Alotaibi</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Amer</surname>
              <given-names>GM</given-names>
            </name>
            <name name-style="western">
              <surname>Abdelkader</surname>
              <given-names>MY</given-names>
            </name>
            <name name-style="western">
              <surname>Ibrahim</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Eid</surname>
              <given-names>MM</given-names>
            </name>
          </person-group>
          <article-title>Robust speech emotion recognition using CNN+LSTM based on stochastic fractal search optimization algorithm</article-title>
          <source>IEEE Access</source>
          <year>2022</year>
          <volume>10</volume>
          <fpage>49265</fpage>
          <lpage>84</lpage>
          <pub-id pub-id-type="doi">10.1109/access.2022.3172954</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Soltau</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Liao</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Sak</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Neural speech recognizer: acoustic-to-word LSTM model for large vocabulary speech recognition</article-title>
          <source>ArXiv. Preprint posted online on October 31, 2016</source>
          <year>2025</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/1610.09975"/>
          </comment>
          <pub-id pub-id-type="doi">10.21437/interspeech.2017-1566</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Senthilkumar</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Karpakam</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Gayathri Devi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Balakumaresan</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Dhilipkumar</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Speech emotion recognition based on bi-directional LSTM architecture and deep belief networks</article-title>
          <source>Mater Today Proc</source>
          <year>2022</year>
          <volume>57</volume>
          <issue>5</issue>
          <fpage>2180</fpage>
          <lpage>4</lpage>
          <pub-id pub-id-type="doi">10.1016/j.matpr.2021.12.246</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Etienne</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Fidanza</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Petrovskii</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Devillers</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Schmauch</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>CNN+LSTM architecture for speech emotion recognition with data augmentation</article-title>
          <source>ArXiv. Preprint posted online on February 15, 2018</source>
          <year>2025</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1802.05630"/>
          </comment>
          <pub-id pub-id-type="doi">10.21437/SMM.2018-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Han</surname>
              <given-names>DK</given-names>
            </name>
            <name name-style="western">
              <surname>Ko</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Fusion-ConvBERT: parallel convolution and BERT fusion for speech emotion recognition</article-title>
          <source>Sensors (Basel)</source>
          <year>2020</year>
          <month>11</month>
          <day>23</day>
          <volume>20</volume>
          <issue>22</issue>
          <fpage>6688</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=s20226688"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/s20226688</pub-id>
          <pub-id pub-id-type="medline">33238396</pub-id>
          <pub-id pub-id-type="pii">s20226688</pub-id>
          <pub-id pub-id-type="pmcid">PMC7700332</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gong</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chung</surname>
              <given-names>YA</given-names>
            </name>
            <name name-style="western">
              <surname>Glass</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>AST: audio spectrogram transformer</article-title>
          <source>ArXiv. Preprint posted online on April 5, 2021</source>
          <year>2025</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2104.01778"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Schuller</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Attention-augmented end-to-end multi-task learning for emotion prediction from speech</article-title>
          <source>ArXiv. Preprint posted online on March 29, 2019</source>
          <year>2019</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1903.12424"/>
          </comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.1903.12424</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ullah</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Asif</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>WA</given-names>
            </name>
            <name name-style="western">
              <surname>Anjam</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Ullah</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Khurshaid</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Wuttisittikulkij</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ali</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Alibakhshikenari</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Speech emotion recognition using convolution neural networks and multi-head convolutional transformer</article-title>
          <source>Sensors (Basel)</source>
          <year>2023</year>
          <month>07</month>
          <day>07</day>
          <volume>23</volume>
          <issue>13</issue>
          <fpage>6212</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=s23136212"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/s23136212</pub-id>
          <pub-id pub-id-type="medline">37448062</pub-id>
          <pub-id pub-id-type="pii">s23136212</pub-id>
          <pub-id pub-id-type="pmcid">PMC10346498</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yoon</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Byun</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Jung</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Multimodal speech emotion recognition using audio and text</article-title>
          <source>ArXiv. Preprint posted online on October 10, 2018</source>
          <year>2018</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1810.04635"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/slt.2018.8639583</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Singh</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Saheer</surname>
              <given-names>LB</given-names>
            </name>
            <name name-style="western">
              <surname>Faust</surname>
              <given-names>O</given-names>
            </name>
          </person-group>
          <article-title>Speech emotion recognition using attention model</article-title>
          <source>Int J Environ Res Public Health</source>
          <year>2023</year>
          <month>03</month>
          <day>14</day>
          <volume>20</volume>
          <issue>6</issue>
          <fpage>5140</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=ijerph20065140"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/ijerph20065140</pub-id>
          <pub-id pub-id-type="medline">36982048</pub-id>
          <pub-id pub-id-type="pii">ijerph20065140</pub-id>
          <pub-id pub-id-type="pmcid">PMC10049636</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gratch</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Artstein</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Lucas</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Stratou</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Scherer</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Nazarian</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Wood</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Boberg</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>DeVault</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Marsella</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Traum</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Rizzo</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Morency</surname>
              <given-names>LP</given-names>
            </name>
          </person-group>
          <article-title>The distress analysis interview corpus of human and computer interviews</article-title>
          <source>Proceedings of the Ninth International Conference on Language Resources and Evaluation</source>
          <year>2014</year>
          <conf-name>LREC'14</conf-name>
          <conf-date>May 26-31, 2014</conf-date>
          <conf-loc>Reykjavik, Iceland</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/L14-1421/"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Livingstone</surname>
              <given-names>SR</given-names>
            </name>
            <name name-style="western">
              <surname>Russo</surname>
              <given-names>FA</given-names>
            </name>
          </person-group>
          <article-title>The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS): a dynamic, multimodal set of facial and vocal expressions in North American English</article-title>
          <source>PLoS One</source>
          <year>2018</year>
          <month>5</month>
          <day>16</day>
          <volume>13</volume>
          <issue>5</issue>
          <fpage>e0196391</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0196391"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0196391</pub-id>
          <pub-id pub-id-type="medline">29768426</pub-id>
          <pub-id pub-id-type="pii">PONE-D-17-28472</pub-id>
          <pub-id pub-id-type="pmcid">PMC5955500</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Steidl</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <source>Automatic Classification of Emotion-related User States in Spontaneous Children's Speech</source>
          <year>2009</year>
          <publisher-loc>Berlin, Germany</publisher-loc>
          <publisher-name>Logos-Verlag</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Busso</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Bulut</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>CC</given-names>
            </name>
            <name name-style="western">
              <surname>Kazemzadeh</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mower</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>JN</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Narayanan</surname>
              <given-names>SS</given-names>
            </name>
          </person-group>
          <article-title>IEMOCAP: interactive emotional dyadic motion capture database</article-title>
          <source>Lang Resour Eval</source>
          <year>2008</year>
          <month>11</month>
          <day>5</day>
          <volume>42</volume>
          <fpage>335</fpage>
          <lpage>59</lpage>
          <pub-id pub-id-type="doi">10.1007/s10579-008-9076-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gournay</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Lahaie</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Lefebvre</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>A Canadian French emotional speech dataset</article-title>
          <source>Proceedings of the 9th ACM Multimedia Systems Conference</source>
          <year>2018</year>
          <conf-name>MMSys '18</conf-name>
          <conf-date>June 12-15, 2018</conf-date>
          <conf-loc>Amsterdam, Netherlands</conf-loc>
          <pub-id pub-id-type="doi">10.1145/3204949.3208121</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>TH</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>CH</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>KY</given-names>
            </name>
            <name name-style="western">
              <surname>Su</surname>
              <given-names>MH</given-names>
            </name>
          </person-group>
          <article-title>Detection of mood disorder using speech emotion profiles and LSTM</article-title>
          <source>Proceedings of the 10th International Symposium on Chinese Spoken Language Processing</source>
          <year>2016</year>
          <conf-name>ISCSLP 2016</conf-name>
          <conf-date>October 17-20, 2016</conf-date>
          <conf-loc>Tianjin, China</conf-loc>
          <pub-id pub-id-type="doi">10.1109/iscslp.2016.7918439</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Fairbairn</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Cohn</surname>
              <given-names>JF</given-names>
            </name>
          </person-group>
          <article-title>Detecting depression severity from vocal prosody</article-title>
          <source>IEEE Trans Affect Comput</source>
          <year>2013</year>
          <volume>4</volume>
          <issue>2</issue>
          <fpage>142</fpage>
          <lpage>50</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/26985326"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/T-AFFC.2012.38</pub-id>
          <pub-id pub-id-type="medline">26985326</pub-id>
          <pub-id pub-id-type="pmcid">PMC4791067</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Stepanov</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Lathuiliere</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chowdhury</surname>
              <given-names>SA</given-names>
            </name>
            <name name-style="western">
              <surname>Ghosh</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Vieriu</surname>
              <given-names>RL</given-names>
            </name>
            <name name-style="western">
              <surname>Sebe</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Riccardi</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Depression severity estimation from multiple modalities</article-title>
          <source>ArXiv. Preprint posted online on November 10, 2017</source>
          <year>2017</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1711.06095"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/healthcom.2018.8531119</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mao</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>DB</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Jiao</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Qian</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Lyu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Ye</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Prediction of depression severity based on the prosodic and semantic features with bidirectional LSTM and time distributed CNN</article-title>
          <source>IEEE Trans Affect Comput</source>
          <year>2023</year>
          <month>7</month>
          <day>1</day>
          <volume>14</volume>
          <issue>3</issue>
          <fpage>2251</fpage>
          <lpage>65</lpage>
          <pub-id pub-id-type="doi">10.1109/taffc.2022.3154332</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref50">
        <label>50</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Cao</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>JK</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Attention guided learnable time-domain filterbanks for speech depression detection</article-title>
          <source>Neural Netw</source>
          <year>2023</year>
          <month>08</month>
          <volume>165</volume>
          <fpage>135</fpage>
          <lpage>49</lpage>
          <pub-id pub-id-type="doi">10.1016/j.neunet.2023.05.041</pub-id>
          <pub-id pub-id-type="medline">37285730</pub-id>
          <pub-id pub-id-type="pii">S0893-6080(23)00285-X</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref51">
        <label>51</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chakraborty</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Tahir</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Maszczyk</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Dauwels</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Thalmann</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Maniam</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Amirah</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>BL</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Prediction of negative symptoms of schizophrenia from emotion related low-level speech signals</article-title>
          <source>Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing</source>
          <year>2018</year>
          <conf-name>ICASSP 2018</conf-name>
          <conf-date>April 15-20, 2018</conf-date>
          <conf-loc>Calgary, AB</conf-loc>
          <pub-id pub-id-type="doi">10.1109/icassp.2018.8462102</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref52">
        <label>52</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Eyben</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Scherer</surname>
              <given-names>KR</given-names>
            </name>
            <name name-style="western">
              <surname>Schuller</surname>
              <given-names>BW</given-names>
            </name>
            <name name-style="western">
              <surname>Sundberg</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Andre</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Busso</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Devillers</surname>
              <given-names>LY</given-names>
            </name>
            <name name-style="western">
              <surname>Epps</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Laukka</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Narayanan</surname>
              <given-names>SS</given-names>
            </name>
            <name name-style="western">
              <surname>Truong</surname>
              <given-names>KP</given-names>
            </name>
          </person-group>
          <article-title>The Geneva Minimalistic Acoustic Parameter Set (GeMAPS) for voice research and affective computing</article-title>
          <source>IEEE Trans Affect Comput</source>
          <year>2016</year>
          <month>4</month>
          <day>1</day>
          <volume>7</volume>
          <issue>2</issue>
          <fpage>190</fpage>
          <lpage>202</lpage>
          <pub-id pub-id-type="doi">10.1109/taffc.2015.2457417</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref53">
        <label>53</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>de Boer</surname>
              <given-names>JN</given-names>
            </name>
            <name name-style="western">
              <surname>Voppel</surname>
              <given-names>AE</given-names>
            </name>
            <name name-style="western">
              <surname>Brederoo</surname>
              <given-names>SG</given-names>
            </name>
            <name name-style="western">
              <surname>Schnack</surname>
              <given-names>HG</given-names>
            </name>
            <name name-style="western">
              <surname>Truong</surname>
              <given-names>KP</given-names>
            </name>
            <name name-style="western">
              <surname>Wijnen</surname>
              <given-names>FN</given-names>
            </name>
            <name name-style="western">
              <surname>Sommer</surname>
              <given-names>IE</given-names>
            </name>
          </person-group>
          <article-title>Acoustic speech markers for schizophrenia-spectrum disorders: a diagnostic and symptom-recognition tool</article-title>
          <source>Psychol Med</source>
          <year>2021</year>
          <month>08</month>
          <day>04</day>
          <volume>53</volume>
          <issue>4</issue>
          <fpage>1302</fpage>
          <lpage>12</lpage>
          <pub-id pub-id-type="doi">10.1017/s0033291721002804</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref54">
        <label>54</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hansen</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>YP</given-names>
            </name>
            <name name-style="western">
              <surname>Wolf</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Sechidis</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Ladegaard</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Fusaroli</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>A generalizable speech emotion recognition model reveals depression and remission</article-title>
          <source>Acta Psychiatr Scand</source>
          <year>2022</year>
          <month>02</month>
          <volume>145</volume>
          <issue>2</issue>
          <fpage>186</fpage>
          <lpage>99</lpage>
          <pub-id pub-id-type="doi">10.1111/acps.13388</pub-id>
          <pub-id pub-id-type="medline">34850386</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref55">
        <label>55</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Corcoran</surname>
              <given-names>CM</given-names>
            </name>
            <name name-style="western">
              <surname>Mittal</surname>
              <given-names>VA</given-names>
            </name>
            <name name-style="western">
              <surname>Bearden</surname>
              <given-names>CE</given-names>
            </name>
            <name name-style="western">
              <surname>E Gur</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Hitczenko</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Bilgrami</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Savic</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Cecchi</surname>
              <given-names>GA</given-names>
            </name>
            <name name-style="western">
              <surname>Wolff</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Language as a biomarker for psychosis: a natural language processing approach</article-title>
          <source>Schizophr Res</source>
          <year>2020</year>
          <month>12</month>
          <volume>226</volume>
          <fpage>158</fpage>
          <lpage>66</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/32499162"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.schres.2020.04.032</pub-id>
          <pub-id pub-id-type="medline">32499162</pub-id>
          <pub-id pub-id-type="pii">S0920-9964(20)30247-4</pub-id>
          <pub-id pub-id-type="pmcid">PMC7704556</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref56">
        <label>56</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ringeval</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Schuller</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Valstar</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Cummins</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Cowie</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Tavabi</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Schmitt</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Alisamir</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Amiriparian</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Messner</surname>
              <given-names>EM</given-names>
            </name>
            <name name-style="western">
              <surname>Song</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Mallol-Ragolta</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ren</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Soleymani</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Pantic</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>AVEC 2019 workshop and challenge: state-of-mind, detecting depression with AI, and cross-cultural affect recognition</article-title>
          <source>Proceedings of the 9th International on Audio/Visual Emotion Challenge and Workshop</source>
          <year>2019</year>
          <conf-name>AVEC '19</conf-name>
          <conf-date>October 21, 2019</conf-date>
          <conf-loc>Nice, France</conf-loc>
          <pub-id pub-id-type="doi">10.1145/3347320.3357688</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref57">
        <label>57</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Luz</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Haider</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>de la Fuente</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Fromm</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>MacWhinney</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Alzheimer's dementia recognition through spontaneous speech: the ADReSS challenge</article-title>
          <source>ArXiv. Preprint posted online on April 14, 2020</source>
          <year>2025</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://arxiv.org/abs/2004.06833"/>
          </comment>
          <pub-id pub-id-type="doi">10.21437/interspeech.2020-2571</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref58">
        <label>58</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Insel</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Cuthbert</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Garvey</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Heinssen</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Pine</surname>
              <given-names>DS</given-names>
            </name>
            <name name-style="western">
              <surname>Quinn</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Sanislow</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Research domain criteria (RDoC): toward a new classification framework for research on mental disorders</article-title>
          <source>Am J Psychiatry</source>
          <year>2010</year>
          <month>07</month>
          <volume>167</volume>
          <issue>7</issue>
          <fpage>748</fpage>
          <lpage>51</lpage>
          <pub-id pub-id-type="doi">10.1176/appi.ajp.2010.09091379</pub-id>
          <pub-id pub-id-type="medline">20595427</pub-id>
          <pub-id pub-id-type="pii">167/7/748</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref59">
        <label>59</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kelly</surname>
              <given-names>JR</given-names>
            </name>
            <name name-style="western">
              <surname>Clarke</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Cryan</surname>
              <given-names>JF</given-names>
            </name>
            <name name-style="western">
              <surname>Dinan</surname>
              <given-names>TG</given-names>
            </name>
          </person-group>
          <article-title>Dimensional thinking in psychiatry in the era of the Research Domain Criteria (RDoC)</article-title>
          <source>Ir J Psychol Med</source>
          <year>2018</year>
          <month>06</month>
          <day>05</day>
          <volume>35</volume>
          <issue>2</issue>
          <fpage>89</fpage>
          <lpage>94</lpage>
          <pub-id pub-id-type="doi">10.1017/ipm.2017.7</pub-id>
          <pub-id pub-id-type="medline">30115193</pub-id>
          <pub-id pub-id-type="pii">S0790966717000076</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref60">
        <label>60</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kotov</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Krueger</surname>
              <given-names>RF</given-names>
            </name>
            <name name-style="western">
              <surname>Watson</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Achenbach</surname>
              <given-names>TM</given-names>
            </name>
            <name name-style="western">
              <surname>Althoff</surname>
              <given-names>RR</given-names>
            </name>
            <name name-style="western">
              <surname>Bagby</surname>
              <given-names>RM</given-names>
            </name>
            <name name-style="western">
              <surname>Brown</surname>
              <given-names>TA</given-names>
            </name>
            <name name-style="western">
              <surname>Carpenter</surname>
              <given-names>WT</given-names>
            </name>
            <name name-style="western">
              <surname>Caspi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Clark</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Eaton</surname>
              <given-names>NR</given-names>
            </name>
            <name name-style="western">
              <surname>Forbes</surname>
              <given-names>MK</given-names>
            </name>
            <name name-style="western">
              <surname>Forbush</surname>
              <given-names>KT</given-names>
            </name>
            <name name-style="western">
              <surname>Goldberg</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Hasin</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Hyman</surname>
              <given-names>SE</given-names>
            </name>
            <name name-style="western">
              <surname>Ivanova</surname>
              <given-names>MY</given-names>
            </name>
            <name name-style="western">
              <surname>Lynam</surname>
              <given-names>DR</given-names>
            </name>
            <name name-style="western">
              <surname>Markon</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Miller</surname>
              <given-names>JD</given-names>
            </name>
            <name name-style="western">
              <surname>Moffitt</surname>
              <given-names>TE</given-names>
            </name>
            <name name-style="western">
              <surname>Morey</surname>
              <given-names>LC</given-names>
            </name>
            <name name-style="western">
              <surname>Mullins-Sweatt</surname>
              <given-names>SN</given-names>
            </name>
            <name name-style="western">
              <surname>Ormel</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Patrick</surname>
              <given-names>CJ</given-names>
            </name>
            <name name-style="western">
              <surname>Regier</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Rescorla</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Ruggero</surname>
              <given-names>CJ</given-names>
            </name>
            <name name-style="western">
              <surname>Samuel</surname>
              <given-names>DB</given-names>
            </name>
            <name name-style="western">
              <surname>Sellbom</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Simms</surname>
              <given-names>LJ</given-names>
            </name>
            <name name-style="western">
              <surname>Skodol</surname>
              <given-names>AE</given-names>
            </name>
            <name name-style="western">
              <surname>Slade</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>South</surname>
              <given-names>SC</given-names>
            </name>
            <name name-style="western">
              <surname>Tackett</surname>
              <given-names>JL</given-names>
            </name>
            <name name-style="western">
              <surname>Waldman</surname>
              <given-names>ID</given-names>
            </name>
            <name name-style="western">
              <surname>Waszczuk</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Widiger</surname>
              <given-names>TA</given-names>
            </name>
            <name name-style="western">
              <surname>Wright</surname>
              <given-names>AG</given-names>
            </name>
            <name name-style="western">
              <surname>Zimmerman</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>The Hierarchical Taxonomy of Psychopathology (HiTOP): a dimensional alternative to traditional nosologies</article-title>
          <source>J Abnorm Psychol</source>
          <year>2017</year>
          <month>05</month>
          <volume>126</volume>
          <issue>4</issue>
          <fpage>454</fpage>
          <lpage>77</lpage>
          <pub-id pub-id-type="doi">10.1037/abn0000258</pub-id>
          <pub-id pub-id-type="medline">28333488</pub-id>
          <pub-id pub-id-type="pii">2017-12889-001</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref61">
        <label>61</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Elizalde</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Deshmukh</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ismail</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>CLAP: learning audio concepts from natural language supervision</article-title>
          <source>ArXiv. Preprint posted online on June 9, 2022</source>
          <year>2025</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2206.04769"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/icassp49357.2023.10095889</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
