<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Ment Health</journal-id><journal-id journal-id-type="publisher-id">mental</journal-id><journal-id journal-id-type="index">16</journal-id><journal-title>JMIR Mental Health</journal-title><abbrev-journal-title>JMIR Ment Health</abbrev-journal-title><issn pub-type="epub">2368-7959</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v13i1e83352</article-id><article-id pub-id-type="doi">10.2196/83352</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Prediction of 12-Week Remission in Patients With Depressive Disorder Using Reasoning-Based Large Language Models: Model Development and Validation Study</article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Park</surname><given-names>Jin-Hyun</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Kang</surname><given-names>Hee-Ju</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Jeon</surname><given-names>Ji Hyeon</given-names></name><degrees>MD</degrees><xref ref-type="aff" 
rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kang</surname><given-names>Sung-Gil</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kim</surname><given-names>Ju-Wan</given-names></name><degrees>MSc, MD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kim</surname><given-names>Jae-Min</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Lee</surname><given-names>Hwamin</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Biomedical Informatics, Korea University College of Medicine</institution><addr-line>161, Jeongneung-ro, Seongbuk-gu</addr-line><addr-line>Seoul</addr-line><country>Republic of Korea</country></aff><aff id="aff2"><institution>Department of Psychiatry, Chonnam National University Medical School</institution><addr-line>Gwangju</addr-line><country>Republic of Korea</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Torous</surname><given-names>John</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Strobl</surname><given-names>Eric</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Wang</surname><given-names>Qihua</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Hwamin Lee, PhD, Department of Biomedical Informatics, Korea University College of Medicine, 161, Jeongneung-ro, Seongbuk-gu, Seoul, 02708, Republic of Korea, 82 2-3407-2099; <email>hwamin@korea.ac.kr</email></corresp><fn fn-type="equal" 
id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>23</day><month>1</month><year>2026</year></pub-date><volume>13</volume><elocation-id>e83352</elocation-id><history><date date-type="received"><day>01</day><month>09</month><year>2025</year></date><date date-type="rev-recd"><day>10</day><month>12</month><year>2025</year></date><date date-type="accepted"><day>11</day><month>12</month><year>2025</year></date></history><copyright-statement>&#x00A9; Jin-Hyun Park, Hee-Ju Kang, Ji Hyeon Jeon, Sung-Gil Kang, Ju-Wan Kim, Jae-Min Kim, Hwamin Lee. Originally published in JMIR Mental Health (<ext-link ext-link-type="uri" xlink:href="https://mental.jmir.org">https://mental.jmir.org</ext-link>), 23.1.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Mental Health, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://mental.jmir.org/">https://mental.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://mental.jmir.org/2026/1/e83352"/><abstract><sec><title>Background</title><p>Depressive disorder affects over 300 million people globally, with only 30% to 40% of patients achieving remission with initial antidepressant monotherapy. 
This low response rate highlights the critical need for digital mental health tools that can identify treatment response early in the clinical pathway.</p></sec><sec><title>Objective</title><p>This study aimed to evaluate whether reasoning-based large language models (LLMs) could accurately predict 12-week remission in patients with depressive disorder undergoing antidepressant monotherapy and to assess the clinical validity and interpretability of model-generated rationales for integration into digital mental health workflows.</p></sec><sec sec-type="methods"><title>Methods</title><p>We analyzed data from 390 patients in the MAKE Biomarker Discovery for Enhancing Antidepressant Treatment Effect and Response (MAKE BETTER) study who were undergoing first-step antidepressant monotherapy with 12 different medications, including escitalopram, paroxetine, sertraline, duloxetine, venlafaxine, desvenlafaxine, milnacipran, mirtazapine, bupropion, vortioxetine, tianeptine, and trazodone, after excluding those with uncommon medications (n=9) or missing biomarker data (n=32). Three LLMs (ChatGPT o1, o3-mini, and Claude 3.7 Sonnet) were tested using advanced prompting strategies, including zero-shot chain-of-thought, atom-of-thoughts, and our novel referencing of deep research prompt. Model performance was evaluated using balanced accuracy, sensitivity, specificity, positive predictive value, and negative predictive value. Three psychiatrists independently assessed model outputs for clinical validity using 5-point Likert scales across multiple dimensions.</p></sec><sec sec-type="results"><title>Results</title><p>Claude 3.7 Sonnet with 32,000 reasoning tokens using the referencing of deep research prompt achieved the highest performance (balanced accuracy=0.6697, sensitivity=0.7183, and specificity=0.6210). Medication-specific analysis revealed negative predictive values of 0.75 or higher across major antidepressants, indicating particular utility in identifying likely nonresponders. 
Clinical evaluation by psychiatrists showed favorable mean ratings for correctness (4.3, SD 0.7), consistency (4.2, SD 0.8), specificity (4.2, SD 0.7), helpfulness (4.2, SD 1.0), and human likeness (3.6, SD 1.7) on 5-point scales.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>These findings demonstrate that reasoning-based LLMs, particularly when enhanced with research-informed prompting, show promise for predicting antidepressant response and could serve as interpretable adjunctive tools in depressive disorder treatment planning, although prospective validation in real-world clinical settings remains essential.</p></sec></abstract><kwd-group><kwd>artificial intelligence</kwd><kwd>clinical support systems</kwd><kwd>depressive disorder</kwd><kwd>large language models</kwd><kwd>natural language processing</kwd><kwd>prognosis</kwd><kwd>treatment outcome</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Depressive disorder is one of the most prevalent and debilitating psychiatric conditions worldwide, ranking as a primary contributor to global disability and significantly influencing the overall disease burden associated with mental disorders [<xref ref-type="bibr" rid="ref1">1</xref>]. Given the substantial burden imposed by depressive disorder, optimizing strategies for early diagnosis, effective treatment, and personalized intervention remains a critical public health priority. Despite the critical need for effective intervention, the primary treatment objective of achieving remission, defined as near-complete symptom resolution, remains challenging, with initial antidepressant monotherapy resulting in remission rates of only 30% to 40% within 12 weeks [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref3">3</xref>]. 
This limited success often necessitates multiple treatment trials, consequently prolonging suffering, increasing health care use and suicide risk, elevating dropout rates [<xref ref-type="bibr" rid="ref4">4</xref>], and ultimately exacerbating patient distress while significantly amplifying treatment nonadherence [<xref ref-type="bibr" rid="ref5">5</xref>].</p><p>Consequently, the early identification of patients who will not achieve remission with a particular monotherapy regimen has become a critical topic in both research and clinical practice. Early identification of patients who are less likely to respond to standard first-line treatments would allow clinicians to tailor interventions more efficiently and reduce the time lost during ineffective treatments [<xref ref-type="bibr" rid="ref6">6</xref>]. Recent studies have explored the use of machine learning (ML) models to predict remission in patients with depressive disorder. However, these investigations have encountered limitations, resulting from study design, which may not reflect real-world clinical practice, including limited diversity in the antidepressants administered and challenges in clinically interpreting the predictions generated by ML models [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref10">10</xref>].</p><p>In recent developments, large language models (LLMs) have emerged as promising instruments for various psychiatric applications, encompassing diagnostic assessment, risk stratification, and clinical decision support [<xref ref-type="bibr" rid="ref11">11</xref>-<xref ref-type="bibr" rid="ref13">13</xref>]. 
Furthermore, LLMs that enhance chain-of-thought reasoning, such as OpenAI&#x2019;s ChatGPT o1 [<xref ref-type="bibr" rid="ref14">14</xref>], ChatGPT o3-mini [<xref ref-type="bibr" rid="ref15">15</xref>], and Anthropic&#x2019;s Claude 3.7 Sonnet [<xref ref-type="bibr" rid="ref16">16</xref>], have been developed and applied within the medical field to improve diagnostic reasoning. These reasoning-enhanced LLMs have demonstrated potential across various medical specialties, yet their application to predicting antidepressant treatment outcomes remains unexplored [<xref ref-type="bibr" rid="ref17">17</xref>-<xref ref-type="bibr" rid="ref21">21</xref>].</p><p>Therefore, in this study, we aimed to evaluate whether reasoning-enhanced LLMs could accurately predict 12-week remission among patients with depressive disorder undergoing monotherapy with 1 of 12 different antidepressants, including selective serotonin reuptake inhibitors (SSRIs), serotonin and norepinephrine reuptake inhibitors (SNRIs), or other antidepressants. We also investigated the underlying clinical rationale of these predictions and explored the feasibility of proposing alternative treatment strategies when remission was deemed unlikely.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Participants and Data Preprocessing</title><p>The dataset for this study was obtained from the MAKE Biomarker Discovery for Enhancing Antidepressant Treatment Effect and Response (MAKE BETTER) study [<xref ref-type="bibr" rid="ref22">22</xref>]. Patients with depressive disorders were consecutively recruited from March 2012 to April 2017 at the outpatient psychiatry department of Chonnam National University Hospital. From the initial cohort, 431 patients who received first-step monotherapy were identified. 
After excluding 9 patients prescribed &#x201C;other&#x201D; medications and 32 lacking blood biomarker data, a total of 390 patients were included in the final analysis.</p><p>Variables assessed included demographic characteristics, personal and familial psychiatric histories, comorbidities, responses to the 9-item Mini-International Neuropsychiatric Interview [<xref ref-type="bibr" rid="ref23">23</xref>], adverse childhood experiences before the age of 16 years (physical, psychological, and sexual abuse), depression subtypes (including melancholic, atypical, and psychotic), and prescribed antidepressants and dosage. Suicidality was assessed using a structured interview comprising 4 standardized questions addressing suicidal thoughts and intent (eg, <italic>&#x201C;</italic>Have you ever felt that life is not worth living?<italic>&#x201D;</italic>). The presence of suicidal ideation determined from these structured questions was subsequently reflected in the Brief Psychiatric Rating Scale [<xref ref-type="bibr" rid="ref24">24</xref>] suicidality item rating. For analysis, only the binary presence or absence of suicidal ideation was used, not the raw Brief Psychiatric Rating Scale score. Additional variables included the Hamilton Depression Rating Scale (HAM-D) [<xref ref-type="bibr" rid="ref25">25</xref>] score, health-related quality of life (EQ-5D) [<xref ref-type="bibr" rid="ref26">26</xref>], functional impairment (Sheehan Disability Scale) [<xref ref-type="bibr" rid="ref27">27</xref>], perceived stress (Perceived Stress Scale) [<xref ref-type="bibr" rid="ref28">28</xref>], resilience (Connor-Davidson Resilience Scale) [<xref ref-type="bibr" rid="ref29">29</xref>], perceived social support (Multidimensional Scale of Perceived Social Support) [<xref ref-type="bibr" rid="ref30">30</xref>], blood biomarkers at baseline, and early treatment response at 2 weeks (&#x2265;20% reduction in HAM-D scores). 
For female participants, fertility and depression-related factors were evaluated, including age at menarche or menopause, hormonal therapy use, and presence of peri- or postpartum or postmenopausal depression. Further details on eligibility, pharmacotherapy, clinical assessments, and biomarker procedures are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. The primary outcome was 12-week remission, defined as an HAM-D score &#x2264;7 sustained through the 12-week assessment point. All analyzed participants were adults, consistent with the validated use of psychiatric assessment tools and pharmacotherapy in adult outpatient clinical practice.</p><p>Numeric coded data were transformed into structured, narrative-style reports in natural language to enhance interpretability by the LLMs, and the comprehensive structure of patient information is depicted in <xref ref-type="other" rid="box1">Textbox 1</xref>.</p><boxed-text id="box1"><title> Structured representation of patient information used for input to the large language models (LLMs). This textbox illustrates the structured format of patient information for individuals with major depressive disorder as prepared for LLM input. 
Each patient&#x2019;s clinical data were inserted into the (patient information) section of the experimental prompt template for subsequent model evaluation.</title><p>(Patient information)</p><list list-type="simple"><list-item><p>(Basic information)</p></list-item><list-item><list list-type="bullet"><list-item><p>Age: xx years</p></list-item><list-item><p>Sex: Male or Female</p></list-item><list-item><p>Height: xxx.x cm</p></list-item><list-item><p>Weight: xx.x kg</p></list-item><list-item><p>Smoking status: Non-smoker, Ex-smoker, or Current smoker</p></list-item><list-item><p>Drinking pattern: Non-drinker, Ex-drinker, or Current drinker</p></list-item><list-item><p>Alcohol Use Disorders Identification Test (AUDIT) score: (For patients who are current drinkers)</p></list-item></list><p>(Female-specific information)</p><list list-type="bullet"><list-item><p>Childbearing potential: Yes or No</p></list-item><list-item><p>Pregnancy experience: Yes or No</p></list-item><list-item><p>Depression during pregnancy: Yes or No</p></list-item><list-item><p>Postpartum depression syndrome: Yes or No</p></list-item><list-item><p>Age at menopause: xx years</p></list-item><list-item><p>Postmenopausal syndrome: Yes or No</p></list-item><list-item><p>Onset of depression at menopause: Yes or No</p></list-item></list><p>(Comorbidities) (All applicable conditions, if any)</p><list list-type="bullet"><list-item><p>Allergic/Immunologic disease, Heart disease, Hypertension, Stroke, Respiratory disease, Dermatologic disease, Ear, nose, and throat (ENT) disease, Endocrine disease, Ophthalmic disease, Gastrointestinal disease, Genitourinary disease, Hematologic cancer, Solid tumor, Musculoskeletal disease, and/or Neurological/Parkinson disease</p></list-item></list><p>(Depression subtype) (All applicable conditions, if any)</p><list list-type="bullet"><list-item><p>Anxious, Melancholic, Atypical, or Psychotic</p></list-item></list><p>(Monotherapy and 2-week Response)</p><list 
list-type="bullet"><list-item><p>Main AD (12w): Escitalopram, Paroxetine, Sertraline, Duloxetine, Venlafaxine, Desvenlafaxine, Milnacipran, Mirtazapine, Bupropion, Vortioxetine, Tianeptine, or Trazodone</p></list-item><list-item><p>Mean dose (12w): xx.x mg - ADT equivalent dose: (12 w): xx.xxx mg</p></list-item><list-item><p>Early response at 2 weeks (&#x2265;20% HAM-D decrease): Yes or No</p></list-item></list><p>(Social-psychological assessments)</p><list list-type="bullet"><list-item><p>HAM-D (Hamilton Depression Rating Scale) total score: xx</p></list-item><list-item><p>EQ-5D (EuroQol-5 Dimension) index: x.xx</p></list-item><list-item><p>SDS (Sheehan Disability Scale) total score: xx</p></list-item><list-item><p>PSS (Perceived Stress Scale) total score: xx</p></list-item><list-item><p>CD-RISC (Connor-Davidson Resilience Scale) total score: xx</p></list-item><list-item><p>MSPSS (Multidimensional Scale of Perceived Social Support) average score: x.xxx</p></list-item></list><p>(Biomarkers)</p><list list-type="bullet"><list-item><p>High-sensitivity C-reactive protein (hs-CRP): xxx mg/L</p></list-item><list-item><p>Tumor necrosis factor-alpha (TNF-&#x03B1;): xx.xx pg/mL</p></list-item><list-item><p>Interleukin-1 beta (IL-1&#x03B2;): x.xx pg/mL</p></list-item><list-item><p>Interleukin-6 (IL-6): x.xxx pg/mL</p></list-item><list-item><p>Interleukin-4 receptor (IL-4R): xxxxx pg/mL</p></list-item><list-item><p>Interleukin-10 (IL-10): xxxxx pg/mL</p></list-item><list-item><p>Leptin: xx.xx ng/mL</p></list-item><list-item><p>Ghrelin: xxxxx pg/mL</p></list-item><list-item><p>Total Cholesterol: xxx mg/dL</p></list-item><list-item><p>Brain-derived neurotrophic factor (BDNF): xxxx ng/mL</p></list-item></list><p>(Mini-International Neuropsychiatric Interview: MINI) (Yes or No)</p><list list-type="bullet"><list-item><p>Over the past 2 weeks, have you felt depressed or down most of the day, nearly every day?</p></list-item><list-item><p>Over the past 2 weeks, have you experienced 
a significantly decreased interest or pleasure in most activities or things you usually enjoy?</p></list-item><list-item><p>Have you had a nearly daily decrease or increase in appetite, or an unintentional weight loss or gain (&#x00B1;5% of your body weight in 1 month)? If either is Yes, record Yes.</p></list-item><list-item><p>Have you had insomnia or hypersomnia nearly every day (difficulty falling asleep, trouble staying asleep, early morning awakening, or sleeping too much)?</p></list-item><list-item><p>Have you spoken or moved more slowly than usual, or have you felt restless or unable to sit still nearly every day? If either is Yes, record Yes.</p></list-item><list-item><p>Have you felt fatigue or loss of energy nearly every day?</p></list-item><list-item><p>Have you felt worthless or guilty nearly every day?</p></list-item><list-item><p>Have you had difficulty concentrating or making decisions nearly every day?</p></list-item><list-item><p>Have you had recurrent thoughts of self-harm, suicidal ideation, or a wish for death?</p></list-item></list></list-item></list></boxed-text></sec><sec id="s2-2"><title>Ethical Considerations</title><p>This study was approved by the Chonnam National University Hospital Institutional Review Board (CNUH 2012&#x2010;014). Written informed consent was obtained from all participants. For minors, parental permission and child assent would have been required under institutional and national regulations; however, no minors were enrolled in this study.</p></sec><sec id="s2-3"><title>Study Design and Zero-Shot Prompting</title><p>This study follows the Transparent Reporting of a multivariable prediction model for Individual Prognosis or Diagnosis guidelines. The design flow is illustrated in <xref ref-type="fig" rid="figure1">Figure 1</xref>. Initially, we conducted data preprocessing to prepare input for the LLMs. 
Subsequently, we used 3 reasoning-based LLMs, including ChatGPT o1 and o3-mini (OpenAI) and Claude 3.7 Sonnet (Anthropic), via an application programming interface to predict 12-week remission in patients with depressive disorder, generating clinical rationales for each prediction and treatment strategies for patients anticipated to not achieve remission; each output consisted of 5 distinct sentences.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Methodological framework for LLM-based prediction of 12-wk remission in patients with depressive disorder. This figure depicts the three-phase methodological approach used in this study: (1) data preprocessing of depressive disorder patients with monotherapy (n=390), including transformation from numeric coded data to natural language format; (2) prompting experiment design; and (3) a comprehensive evaluation framework encompassing quantitative, medication-specific, and clinical assessments. AoT: atom-of-thoughts; CoT: chain-of-thought; LLM: large language model; NPV: negative predictive value; PPV: positive predictive value; RoD: referencing of deep research.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mental_v13i1e83352_fig01.png"/></fig><p>We conducted zero-shot experiments to assess the performance of these LLMs. OpenAI&#x2019;s models were evaluated across 3 levels of &#x201C;reasoning effort&#x201D; parameters (low, medium, and high), while the Anthropic model was tested at 3 reasoning budget token settings (8000; 16,000; and 32,000 tokens). The detailed structure of the zero-shot prompt is illustrated in <xref ref-type="other" rid="box2">Textbox 2</xref>.</p><boxed-text id="box2"><title> Structure of zero-shot prompt. 
The prompt message remained consistent across all experiments, with only the (patient information) section being systematically replaced with individual patient data for each experimental case.</title><p>(Zero-shot prompt)</p><p>You are an experienced psychiatrist specializing in depressive disorder. You can access a depressive disorder patient's baseline data, including monotherapy prescribing information and 2-week response.</p><p>Your task:</p><list list-type="order" prefix-word="1"><list-item><p>Predict the depressive disorder patient's 12-week remission as &#x201C;Yes&#x201D; or &#x201C;No.&#x201D;</p></list-item><list-item><p>Provide a &#x201C;Clinical Rationale&#x201D; of exactly five sentences (1~5).</p></list-item><list-item><p>If you predict &#x201C;No,&#x201D; also provide the next &#x201C;Treatment Strategy&#x201D; of exactly five sentences (1~5).</p></list-item><list-item><p>Final Output Format (follow precisely):</p></list-item></list><p>Remission prediction &#x003C;Yes or No&#x003E;</p><p>Clinical Rationale:</p><list list-type="order" prefix-word="1"><list-item><p>...</p></list-item><list-item><p>...</p></list-item><list-item><p>...</p></list-item><list-item><p>...</p></list-item><list-item><p>...</p></list-item></list><p>Treatment Strategy (only if you predict &#x201C;No&#x201D;)</p><list list-type="order" prefix-word="1"><list-item><p>...</p></list-item><list-item><p>...</p></list-item><list-item><p>...</p></list-item><list-item><p>...</p></list-item><list-item><p>...</p></list-item></list><p>Below is the patient's baseline data, including (Basic Information), (including (Female-specific Information) if the patient is female), (Comorbidities), (Mini-International Neuropsychiatric Interview), (Depression Subtype) if present, (Adverse Childhood Experiences (ACEs)) if present, (Depression History &#x0026; Suicidality), (Monotherapy &#x0026; 2-week Response), (Social-Psychological Assessments), and (Biomarkers).</p><p>Please use this data to predict 
the 12-week remission status (Yes/No) and follow the instructions above.</p><p>(Patient Information):</p></boxed-text><p>The best-performing zero-shot model, based on balanced accuracy, was further evaluated using advanced prompting strategies to enhance reasoning and interpretability. Specifically, the zero-shot chain-of-thought (CoT) prompting method [<xref ref-type="bibr" rid="ref31">31</xref>] and the atom-of-thoughts (AoT) technique [<xref ref-type="bibr" rid="ref32">32</xref>], both of which have shown strong performance on benchmark datasets, were adapted for this study. We also introduced a novel &#x201C;referencing of deep research (RoD)&#x201D; prompting strategy, which leverages OpenAI&#x2019;s deep research [<xref ref-type="bibr" rid="ref33">33</xref>] to generate research reports that are subsequently incorporated into the zero-shot prompt for additional context.</p><p>Finally, our evaluation process comprised multiple sequential phases. First, we conducted a comprehensive quantitative assessment of the zero-shot prompting approaches. Subsequently, using the best-performing model identified through this initial evaluation, we implemented the advanced prompting experiments and subjected them to identical quantitative evaluation methodologies. For the best advanced prompting model, we then performed medication-specific evaluations. 
Additionally, board-certified medical doctors evaluated the model-generated rationales and treatment strategies.</p></sec><sec id="s2-4"><title>Advanced Prompting</title><p>The zero-shot CoT was implemented by inserting the phrase &#x201C;Let&#x2019;s think step by step&#x201D; immediately before the patient information section in the original zero-shot prompt.</p><p>The AoT technique comprised three phases: decomposition (breaking the question into subquestions), contraction (consolidating into an &#x201C;atomic&#x201D; question), and final solve (generating predictions with rationale, and, if necessary, subsequent treatment strategies). Detailed prompts appear in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p><p>Furthermore, we introduced an RoD prompt to incorporate external evidence regarding 12-week remission outcomes. First, we used OpenAI&#x2019;s deep research [<xref ref-type="bibr" rid="ref33">33</xref>] to investigate prior studies on the relationship between our variables of interest and 12-week remission outcomes. A representative dialogue example demonstrating the generation of &#x201C;deep research report&#x201D; is presented in <xref ref-type="fig" rid="figure2">Figure 2</xref>.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>A representative dialogue example of &#x201C;deep research report&#x201D; generation. 
This figure illustrates an example dialogue where the user uses OpenAI&#x2019;s deep research functionality to generate a &#x201C;deep research report.&#x201D; MAKE BETTER study: MAKE Biomarker Discovery for Enhancing Antidepressant Treatment Effect and Response study.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mental_v13i1e83352_fig02.png"/></fig><p>This systematic examination deliberately excluded publications authored by contributors to the MAKE BETTER dataset to mitigate potential confirmation bias and ensure methodological independence in our analysis framework. We then consolidated these findings into a &#x201C;deep research report&#x201D; and subsequently integrated this report into the zero-shot prompt to design the RoD prompt. The prompt for conducting the &#x201C;deep research report&#x201D; and the RoD prompt are presented in <xref ref-type="other" rid="box3">Textbox 3</xref>.</p><boxed-text id="box3"><title> The prompts of deep research and referencing of deep research. The deep research prompt template, used for OpenAI&#x2019;s deep research functionality, instructs the model to function as a research assistant. The hierarchical structure of the patient information is systematically mapped to the prompt parameters, with bracketed &#x201C;[]&#x201D; items from the (patient information) corresponding to (big category) classifications, while hyphenated &#x201C;-&#x201D; elements are allocated to (small category) designations. The referencing of the deep research prompt template incorporates outputs from deep research operations into the (deep research report) section, emulating the clinical reasoning process whereby practitioners consult and integrate contemporary research literature before formulating diagnostic conclusions. 
Sections marked as (omit) indicate portions where identical prompt text from previously described templates has been elided for clarity.</title><p><bold>(Deep Research Prompt)</bold></p><p>You are a highly trained psychiatric research assistant.</p><p>Your goal is to investigate peer-reviewed journal articles about 12-week remission outcomes in depressive disorder patients receiving monotherapy. The following types of monotherapy medications may be considered: escitalopram, paroxetine, sertraline, duloxetine, venlafaxine, desvenlafaxine, milnacipran, mirtazapine, bupropion, vortioxetine, tianeptine, or trazodone.</p><p>You will focus on the following "(Big category)" variables and their possible relationship to remission rates:</p><p>(Small Category)</p><list list-type="bullet"><list-item><p>Age, Sex ... (omitted) ... Homocysteine</p></list-item></list><list list-type="order" prefix-word="1"><list-item><p>Only consider peer-reviewed journal articles.</p></list-item><list-item><p>Exclude any articles by authors involved in the MAKE BETTER dataset.</p></list-item><list-item><p>Summarize your findings for each variable under the specified headings below.</p></list-item><list-item><p>If no evidence is found for a particular variable, write "None."</p></list-item><list-item><p>Provide a brief "Conclusion" section at the end, summarizing your overall findings.</p></list-item></list><p>Use the exact report format shown below, substituting only the bullet points with your findings or &#x201C;None.&#x201D; Then add the final &#x201C;Conclusion&#x201D; after these bullet points.</p><p>(Big Category)</p><list list-type="bullet"><list-item><p>(Small Category)</p></list-item><list-item><p>...</p></list-item></list><p>Conclusion:</p><p><bold>(RoD prompt)</bold></p><p>You are an experienced ... (omitted) ... 
2-week response, as well as a deep research report summarizing findings on 12-week remission outcomes for depressive disorder monotherapy.</p><p>Reason as needed, incorporating your own expertise and the research evidence contained in the deep research report below.</p><p>(Deep research report)</p><p>Your task:</p><list list-type="order" prefix-word="1"><list-item><p>... (omitted) ...</p></list-item><list-item><p>... (omitted) ...</p></list-item><list-item><p>... (omitted) ...</p></list-item><list-item><p>Do not copy research text verbatim. Summarize relevant parts like a clinician referencing journal articles.</p></list-item><list-item><p>Final output format (follow precisely): ... (omitted) ...</p></list-item></list></boxed-text><p>The model was instructed to reference rather than directly replicate relevant insights from the &#x201C;deep research report&#x201D; when generating predictions and clinical rationales, thereby emulating the manner in which a practicing clinician would consult and synthesize findings from journal articles.</p></sec><sec id="s2-5"><title>Evaluation</title><p>For the 12-week remission prediction task, we designated &#x201C;yes&#x201D; as the positive class and &#x201C;no&#x201D; as the negative class. We computed balanced accuracy, sensitivity, specificity, positive predictive value (PPV), and negative predictive value (NPV) to compare quantitative performance. Additionally, to evaluate efficiency, we recorded both the inference generation cost and the average generation time (in seconds). 
The best-performing zero-shot model was selected based on balanced accuracy, reflecting the equal importance of both classes.</p><p>Additionally, we performed benchmarking analyses using logistic regression, random forest, and XGBoost models, evaluated through a patient-level stratified 15% hold-out design with repeated 10&#x00D7;5-fold cross-validation, reporting balanced accuracy, sensitivity, specificity, PPV, and NPV with 95% CIs across random seeds.</p><p>Subsequently, we applied the CoT, AoT, and RoD prompting methods to this best-performing model, compared their final performance using the same metrics, and further examined the medication-specific performance of the model that achieved the highest overall balanced accuracy.</p><p>Finally, 3 evaluators (2 psychiatry residents with &#x003E;2 years of training and 1 psychiatrist specializing in depressive disorder with &#x003E;10 years of experience) independently reviewed the clinical rationales and subsequent treatment strategies generated for the correctly predicted cases by the best-performing model. They assessed these outputs across 5 domains (consistency, correctness, specificity, helpfulness, and human likeness) using a 5-point rating scale [<xref ref-type="bibr" rid="ref34">34</xref>]. Consistency measured how closely the generated text aligned with the predicted answers, correctness evaluated its medical accuracy, specificity assessed its level of detail, helpfulness examined its clinical usefulness, and human likeness considered how similar it was to typical human judgment.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Baseline Demographics and Clinical Characteristics</title><p><xref ref-type="table" rid="table1">Table 1</xref> summarizes the baseline demographics and clinical characteristics across different monotherapy groups. 
The study population consisted of 244 patients prescribed SSRIs (escitalopram: n=159, 65%; paroxetine: n=60, 25%; and sertraline: n=25, 10%), 33 patients receiving SNRIs (duloxetine: n=20, 61%; venlafaxine: n=10, 30%; desvenlafaxine: n=2, 6%; and milnacipran: n=1, 3%), 99 patients on mirtazapine, 9 patients prescribed bupropion, and 5 patients taking other antidepressants (vortioxetine: n=3, 60%; tianeptine: n=1, 20%; and trazodone: n=1, 20%).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Baseline demographics and clinical characteristics of preprocessed patients with depressive disorder, stratified according to the types of prescribed monotherapy (n=390).</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Characteristics</td><td align="left" valign="bottom" colspan="5">Types of prescribed monotherapy</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">SSRI<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup><break/>(n=244)</td><td align="left" valign="top">SNRI<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup><break/>(n=33)</td><td align="left" valign="top">Mirtazapine<break/>(n=99)</td><td align="left" valign="top">Bupropion<break/>(n=9)</td><td align="left" valign="top">Others<break/>(n=5)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="6">Sex, n (%)</td></tr><tr><td align="left" valign="top">&#x2003;Female</td><td align="left" valign="top">175 (72)</td><td align="left" valign="top">25 (76)</td><td align="left" valign="top">77 (78)</td><td align="left" valign="top">4 (44)</td><td align="left" valign="top">4 (80)</td></tr><tr><td align="left" valign="top">&#x2003;Male</td><td align="left" valign="top">69 (28)</td><td align="left" valign="top">8 (24)</td><td align="left" valign="top">22 (22)</td><td align="left" valign="top">5 (56)</td><td align="left" valign="top">1 (20)</td></tr><tr><td align="left" valign="top" 
colspan="6">Employment status, n (%)</td></tr><tr><td align="left" valign="top">&#x2003;Yes</td><td align="left" valign="top">180 (74)</td><td align="left" valign="top">25 (76)</td><td align="left" valign="top">67 (68)</td><td align="left" valign="top">7 (78)</td><td align="left" valign="top">4 (80)</td></tr><tr><td align="left" valign="top">&#x2003;No</td><td align="left" valign="top">64 (26)</td><td align="left" valign="top">8 (24)</td><td align="left" valign="top">32 (32)</td><td align="left" valign="top">2 (22)</td><td align="left" valign="top">1 (20)</td></tr><tr><td align="left" valign="top" colspan="6">Living alone, n (%)</td></tr><tr><td align="left" valign="top">&#x2003;Yes</td><td align="left" valign="top">41 (17)</td><td align="left" valign="top">2 (6)</td><td align="left" valign="top">17 (17)</td><td align="left" valign="top">2 (22)</td><td align="left" valign="top">3 (60)</td></tr><tr><td align="left" valign="top">&#x2003;No</td><td align="left" valign="top">203 (83)</td><td align="left" valign="top">31 (94)</td><td align="left" valign="top">82 (83)</td><td align="left" valign="top">7 (78)</td><td align="left" valign="top">2 (40)</td></tr><tr><td align="left" valign="top" colspan="6">12-week remission, n (%)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Yes</td><td align="left" valign="top">83 (34)</td><td align="left" valign="top">13 (39)</td><td align="left" valign="top">42 (42)</td><td align="left" valign="top">3 (33)</td><td align="left" valign="top">1 (20)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>No</td><td align="left" valign="top">161 (66)</td><td align="left" valign="top">20 (61)</td><td align="left" valign="top">57 (58)</td><td align="left" valign="top">6 (67)</td><td align="left" valign="top">4 (80)</td></tr><tr><td align="left" valign="top">Age (y), mean (SD)</td><td 
align="left" valign="top">56.8 (14.5)</td><td align="left" valign="top">58.4 (9.5)</td><td align="left" valign="top">60.4 (14.1)</td><td align="left" valign="top">46.4 (14.6)</td><td align="left" valign="top">58.6 (8.0)</td></tr><tr><td align="left" valign="top">Height (cm), mean (SD)</td><td align="left" valign="top">159.9 (8.9)</td><td align="left" valign="top">157.3 (8.1)</td><td align="left" valign="top">159.1 (7.7)</td><td align="left" valign="top">165.1 (6.6)</td><td align="left" valign="top">157.9 (8.7)</td></tr><tr><td align="left" valign="top">Weight (kg), mean (SD)</td><td align="left" valign="top">59.7 (10.5)</td><td align="left" valign="top">58.3 (9.2)</td><td align="left" valign="top">59.4 (9.7)</td><td align="left" valign="top">59.0 (11.8)</td><td align="left" valign="top">60.7 (8.5)</td></tr><tr><td align="left" valign="top">HAM-D<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup>, mean (SD)</td><td align="left" valign="top">20.4 (4.1)</td><td align="left" valign="top">20.5 (4.1)</td><td align="left" valign="top">21.2 (3.9)</td><td align="left" valign="top">18.6 (4.7)</td><td align="left" valign="top">22.2 (4.4)</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>SSRI: selective serotonin reuptake inhibitor.</p></fn><fn id="table1fn2"><p><sup>b</sup>SNRI: serotonin and norepinephrine reuptake inhibitor.</p></fn><fn id="table1fn3"><p><sup>c</sup>HAM-D: the Hamilton Depression Rating Scale.</p></fn></table-wrap-foot></table-wrap><p>Among the total cohort (n=390), female participants constituted the majority (285/390, 73%), with a similar sex distribution across the SSRI (175/244, 72%), SNRI (25/33, 76%), and mirtazapine (77/99, 78%) groups. Employment was reported by 74% (180/244) of SSRI users, 76% (25/33) of SNRI users, and 68% (67/99) of mirtazapine users. At the 12-week assessment, 34% (83/244) of SSRI users, 39% (13/33) of SNRI users, and 42% (42/99) of mirtazapine users achieved remission. 
The mean baseline HAM-D scores ranged from 18.6 (SD 4.7) to 22.2 (SD 4.4) points. Participants in the mirtazapine group were the oldest (mean 60.4, SD 14.1 y), and those in the bupropion group were the youngest (mean 46.4, SD 14.6 y).</p></sec><sec id="s3-2"><title>Performance of Zero-Shot Prompting</title><p>The zero-shot performance section of <xref ref-type="table" rid="table2">Table 2</xref> delineates the comparative outcomes of zero-shot experiments conducted with OpenAI&#x2019;s ChatGPT o1 and o3-mini models across 3 distinct levels of reasoning effort, namely &#x201C;low,&#x201D; &#x201C;medium,&#x201D; and &#x201C;high,&#x201D; as well as for Anthropic&#x2019;s Claude 3.7 Sonnet under 3 varying token budget settings (8000; 16,000; and 32,000 tokens). The findings indicate that all models demonstrated sensitivity values ranging from 0.6690 to 0.9085, suggesting that a significant proportion of patients who achieved remission were accurately identified. Conversely, specificity, which measures the correct identification of patients who did not achieve remission, exhibited lower values, ranging from 0.3185 to 0.6331 across the evaluated LLMs.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Quantitative performance of zero-shot and advanced prompting techniques across 390 samples, including balanced accuracy, sensitivity, specificity, PPV,<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup> and NPV<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup>.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" colspan="3">Prompting, models, and<break/>reasoning<break/>parameters</td><td align="left" valign="bottom">Balanced<break/>accuracy</td><td align="left" valign="bottom">Sensitivity</td><td align="left" valign="bottom">Specificity</td><td align="left" valign="bottom">PPV</td><td align="left" valign="bottom">NPV</td><td align="left" valign="bottom">Time per 
generation<break/>(s)</td><td align="left" valign="bottom">Total cost (US $)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="10">Zero-shot</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ChatGPT o1</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Low</td><td align="char" char="." valign="top">0.6135</td><td align="char" char="." valign="top">0.9085</td><td align="char" char="." valign="top">0.3185</td><td align="char" char="." valign="top">0.4329</td><td align="char" char="." valign="top">0.8587</td><td align="char" char="." valign="top">11.44</td><td align="char" char="." valign="top">22.36</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Medium</td><td align="char" char="." valign="top">0.6382</td><td align="char" char="." valign="top">0.9014</td><td align="char" char="." valign="top">0.3750</td><td align="char" char="." valign="top">0.4523</td><td align="char" char="." valign="top">0.8692</td><td align="char" char="." valign="top">19.63</td><td align="char" char="." valign="top">35.20</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>High</td><td align="char" char="." valign="top">0.6333</td><td align="char" char="." 
valign="top">0.8592</td><td align="char" char="." valign="top">0.4073</td><td align="char" char="." valign="top">0.4535</td><td align="char" char="." valign="top">0.8347</td><td align="char" char="." valign="top">30.08</td><td align="char" char="." valign="top">53.07</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>ChatGPT o3-mini</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Low</td><td align="char" char="." valign="top">0.6121</td><td align="char" char="." valign="top">0.8169</td><td align="char" char="." valign="top">0.4073</td><td align="char" char="." valign="top">0.4411</td><td align="char" char="." valign="top">0.7953</td><td align="char" char="." valign="top">4.84</td><td align="char" char="." valign="top">1.14</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Medium</td><td align="char" char="." valign="top">0.6091</td><td align="char" char="." valign="top">0.8028</td><td align="char" char="." valign="top">0.4153</td><td align="char" char="." valign="top">0.4402</td><td align="char" char="." valign="top">0.7863</td><td align="char" char="." valign="top">8.89</td><td align="char" char="." 
valign="top">2.00</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>High</td><td align="char" char="." valign="top">0.6323</td><td align="char" char="." valign="top">0.8169</td><td align="char" char="." valign="top">0.4476</td><td align="char" char="." valign="top">0.4585</td><td align="char" char="." valign="top">0.8102</td><td align="char" char="." valign="top">20.43</td><td align="char" char="." valign="top">4.39</td></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Claude 3.7 Sonnet</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="char" char="." valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>8000</td><td align="char" char="." valign="top">0.6349</td><td align="char" char="." valign="top">0.6972</td><td align="char" char="." valign="top">0.5726</td><td align="char" char="." valign="top">0.4829</td><td align="char" char="." valign="top">0.7676</td><td align="char" char="." valign="top">22.23</td><td align="char" char="." valign="top">9.81</td></tr><tr><td align="char" char="." valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>16,000</td><td align="char" char="." valign="top">0.6511</td><td align="char" char="." valign="top">0.6690</td><td align="char" char="." valign="top">0.6331</td><td align="char" char="." 
valign="top">0.5108</td><td align="char" char="." valign="top">0.7696</td><td align="char" char="." valign="top">23.78</td><td align="char" char="." valign="top">10.90</td></tr><tr><td align="char" char="." valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>32,000</td><td align="char" char="." valign="top">0.6656</td><td align="char" char="." valign="top">0.7183</td><td align="char" char="." valign="top">0.6129</td><td align="char" char="." valign="top">0.5152</td><td align="char" char="." valign="top">0.7917</td><td align="char" char="." valign="top">26.84</td><td align="char" char="." valign="top">11.58</td></tr><tr><td align="left" valign="top" colspan="10">Zero-shot CoT<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td></tr><tr><td align="char" char="." valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Claude 3.7 Sonnet with 32,000 tokens</td><td align="char" char="." valign="top">0.6319</td><td align="char" char="." valign="top">0.6549</td><td align="char" char="." valign="top">0.6089</td><td align="char" char="." valign="top">0.4895</td><td align="char" char="." valign="top">0.7550</td><td align="char" char="." valign="top">27.24</td><td align="char" char="." valign="top">12.13</td></tr><tr><td align="left" valign="top" colspan="10">Zero-shot AoT<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup></td></tr><tr><td align="char" char="." valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Claude 3.7 Sonnet with 32,000 tokens</td><td align="char" char="." valign="top">0.6522</td><td align="char" char="." valign="top">0.4859</td><td align="char" char="." valign="top">0.8185</td><td align="char" char="." valign="top">0.6053</td><td align="char" char="." 
valign="top">0.7355</td><td align="char" char="." valign="top">126.92</td><td align="char" char="." valign="top">57.56</td></tr><tr><td align="left" valign="top" colspan="10">Zero-shot RoD<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td></tr><tr><td align="char" char="." valign="top" colspan="3"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Claude 3.7 Sonnet with 32,000 tokens</td><td align="char" char="." valign="top">0.6697</td><td align="char" char="." valign="top">0.7183</td><td align="char" char="." valign="top">0.6210</td><td align="char" char="." valign="top">0.5204</td><td align="char" char="." valign="top">0.7938</td><td align="char" char="." valign="top">43.88</td><td align="char" char="." valign="top">39.56</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>PPV: positive predictive value.</p></fn><fn id="table2fn2"><p><sup>b</sup>NPV: negative predictive value.</p></fn><fn id="table2fn3"><p><sup>c</sup>CoT: chain-of-thoughts.</p></fn><fn id="table2fn4"><p><sup>d</sup>AoT: atom-of-thoughts.</p></fn><fn id="table2fn5"><p><sup>e</sup>RoD: referencing of deep research.</p></fn></table-wrap-foot></table-wrap><p>As the reasoning effort increased, all 3 models showed enhancements in both specificity and balanced accuracy. Specifically, the ChatGPT o1 model&#x2019;s specificity improved from 0.3185 to 0.4073, with balanced accuracy rising from 0.6135 to 0.6333. Similarly, the ChatGPT o3-mini model experienced an increase in specificity from 0.4073 to 0.4476, alongside an improvement in balanced accuracy from 0.6121 to 0.6323. The Claude 3.7 Sonnet model also demonstrated an increase in specificity from 0.5726 to 0.6129, with a modest rise in balanced accuracy from 0.6349 to 0.6656.</p><p>From a computational efficiency standpoint, an increase in reasoning level generally resulted in heightened time and cost requirements across all models. 
Across all models evaluated, ChatGPT o1 incurred the highest overall costs, with total expenses ranging from $22.36 to $53.07. In contrast, ChatGPT o3-mini was the least expensive model, with total costs between $1.14 and $4.39. Furthermore, ChatGPT o3-mini exhibited superior speed efficiency, with task completion times ranging from 4.84 to 20.43 seconds, outperforming the other models in computational efficiency.</p><p>Conversely, Claude 3.7 Sonnet maintained a relatively stable computational profile across varying token budgets, with task completion times ranging from 22.23 seconds at the 8000-token setting to 26.84 seconds at the 32,000-token setting, and total costs increasing modestly from $9.81 to $11.58. Despite requiring more time per task than ChatGPT o3-mini at lower settings, Claude 3.7 Sonnet&#x2019;s costs remained significantly lower than those of ChatGPT o1 at higher reasoning levels, while achieving the best overall performance, as evidenced by its balanced accuracy of 0.6656 at the 32,000-token reasoning budget. The detailed confusion matrices for all zero-shot prompting experiments are presented in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>.</p></sec><sec id="s3-3"><title>Performance of Advanced Prompting</title><p>The advanced prompting (zero-shot CoT, AoT, and RoD) performance section of <xref ref-type="table" rid="table2">Table 2</xref> outlines the performance metrics of 3 advanced prompt strategies applied to the Claude 3.7 Sonnet model using a 32,000-token reasoning budget, which demonstrated the best performance in the zero-shot context.</p><p>Among the advanced prompt strategies, the zero-shot CoT exhibited a balanced accuracy of 0.6319, with sensitivity and specificity values of 0.6549 and 0.6089, respectively, alongside a PPV of 0.4895 and an NPV of 0.7550. 
This performance is marginally lower than that of Claude 3.7 Sonnet&#x2019;s zero-shot approach, particularly in terms of sensitivity and balanced accuracy.</p><p>The AoT strategy demonstrated a balanced accuracy of 0.6522, with a sensitivity of 0.4859 and a specificity of 0.8185. Its PPV and NPV were recorded at 0.6053 and 0.7355, respectively, while the time per task reached 126.92 seconds, and total costs escalated to $57.56, indicating a significant increase in computational resource demands compared to the zero-shot approach of Claude 3.7 Sonnet.</p><p>In contrast, the RoD approach achieved the highest balanced accuracy among the advanced prompts at 0.6697, with a sensitivity of 0.7183 and a specificity of 0.6210, slightly surpassing the performance of Claude 3.7 Sonnet&#x2019;s zero-shot method. However, RoD&#x2019;s time per task was approximately 1.63 times, and its total cost approximately 3.42 times, that of the zero-shot setting. The detailed confusion matrices for all advanced prompting experiments are presented in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>.</p><p>For reference, conventional ML models trained on the numerically coded dataset achieved balanced accuracies ranging from 0.6077 to 0.7371 and sensitivities from 0.3533 to 0.6364 with overlapping 95% CIs (<xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>).</p></sec><sec id="s3-4"><title>Medication-Specific Performance</title><p><xref ref-type="table" rid="table3">Table 3</xref> presents the performance metrics for the RoD strategy across various antidepressants, including SSRIs (escitalopram, paroxetine, and sertraline), SNRIs (duloxetine, venlafaxine, desvenlafaxine, and milnacipran), mirtazapine, bupropion, and others (vortioxetine, tianeptine, and trazodone), along with the number of correct predictions for both remission and nonremission outcomes. 
Among antidepressants with more than 50 cases, escitalopram (n=159), mirtazapine (n=99), and paroxetine (n=60) achieved balanced accuracies of 0.6799, 0.6873, and 0.6375, respectively.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Quantitative performance of RoD<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup> prompting by medications, applied to Claude 3.7 Sonnet configured with 32,000 reasoning budget tokens.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" colspan="2">Medications</td><td align="left" valign="bottom">Balanced accuracy</td><td align="left" valign="bottom">Sensitivity</td><td align="left" valign="bottom">Specificity</td><td align="left" valign="bottom">PPV<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="bottom">NPV<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="left" valign="bottom">Correct predictions (yes), n/N</td><td align="left" valign="bottom">Correct predictions<break/>(no), n/N</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="9">SSRI<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup></td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Escitalopram</td><td align="left" valign="top">0.6799</td><td align="left" valign="top">0.7407</td><td align="left" valign="top">0.6190</td><td align="left" valign="top">0.5000</td><td align="left" valign="top">0.8228</td><td align="left" valign="top">40/54</td><td align="left" valign="top">65/105</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Paroxetine</td><td align="left" valign="top">0.6375</td><td align="left" valign="top">0.8000</td><td align="left" valign="top">0.4750</td><td align="left" valign="top">0.4324</td><td align="left" 
valign="top">0.8261</td><td align="left" valign="top">16/20</td><td align="left" valign="top">19/40</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Sertraline</td><td align="left" valign="top">0.7083</td><td align="left" valign="top">0.6667</td><td align="left" valign="top">0.7500</td><td align="left" valign="top">0.6000</td><td align="left" valign="top">0.8000</td><td align="left" valign="top">6/9</td><td align="left" valign="top">12/16</td></tr><tr><td align="left" valign="top" colspan="9">SNRI<sup><xref ref-type="table-fn" rid="table3fn5">e</xref></sup></td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Duloxetine</td><td align="left" valign="top">0.6190</td><td align="left" valign="top">0.6667</td><td align="left" valign="top">0.5714</td><td align="left" valign="top">0.4000</td><td align="left" valign="top">0.8000</td><td align="left" valign="top">4/6</td><td align="left" valign="top">8/14</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Venlafaxine</td><td align="left" valign="top">0.7083</td><td align="left" valign="top">0.7500</td><td align="left" valign="top">0.6667</td><td align="left" valign="top">0.6000</td><td align="left" valign="top">0.8000</td><td align="left" valign="top">3/4</td><td align="left" valign="top">4/6</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Desvenlafaxine</td><td align="left" valign="top">0.5000</td><td align="left" valign="top">1.0000</td><td align="left" valign="top">0.0000</td><td align="left" valign="top">1.0000</td><td align="left" valign="top">0.0000</td><td align="left" valign="top">2/2</td><td align="left" valign="top">0/0</td></tr><tr><td align="left" valign="top" 
colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Milnacipran</td><td align="left" valign="top">0.0000</td><td align="left" valign="top">0.0000</td><td align="left" valign="top">0.0000</td><td align="left" valign="top">0.0000</td><td align="left" valign="top">0.0000</td><td align="left" valign="top">0/1</td><td align="left" valign="top">0/0</td></tr><tr><td align="left" valign="top" colspan="2">Mirtazapine</td><td align="left" valign="top">0.6873</td><td align="left" valign="top">0.6905</td><td align="left" valign="top">0.6842</td><td align="left" valign="top">0.6170</td><td align="left" valign="top">0.7500</td><td align="left" valign="top">29/42</td><td align="left" valign="top">39/57</td></tr><tr><td align="left" valign="top" colspan="2">Bupropion</td><td align="left" valign="top">0.7500</td><td align="left" valign="top">0.6667</td><td align="left" valign="top">0.8333</td><td align="left" valign="top">0.6667</td><td align="left" valign="top">0.8333</td><td align="left" valign="top">2/3</td><td align="left" valign="top">5/6</td></tr><tr><td align="left" valign="top" colspan="9">Others</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Vortioxetine</td><td align="left" valign="top">0.0000</td><td align="left" valign="top">0.0000</td><td align="left" valign="top">0.0000</td><td align="left" valign="top">0.0000</td><td align="left" valign="top">0.0000</td><td align="left" valign="top">0/1</td><td align="left" valign="top">0/2</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Tianeptine</td><td align="left" valign="top">0.5000</td><td align="left" valign="top">0.0000</td><td align="left" valign="top">1.0000</td><td align="left" valign="top">0.0000</td><td align="left" valign="top">1.0000</td><td align="left" valign="top">0/0</td><td align="left" 
valign="top">1/1</td></tr><tr><td align="left" valign="top" colspan="2"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Trazodone</td><td align="left" valign="top">0.5000</td><td align="left" valign="top">0.0000</td><td align="left" valign="top">1.0000</td><td align="left" valign="top">0.0000</td><td align="left" valign="top">1.0000</td><td align="left" valign="top">0/0</td><td align="left" valign="top">1/1</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>RoD: referencing of deep research.</p></fn><fn id="table3fn2"><p><sup>b</sup>PPV: positive predictive value.</p></fn><fn id="table3fn3"><p><sup>c</sup>NPV: negative predictive value.</p></fn><fn id="table3fn4"><p><sup>d</sup>SSRI: selective serotonin reuptake inhibitor.</p></fn><fn id="table3fn5"><p><sup>e</sup>SNRI: serotonin and norepinephrine reuptake inhibitor.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-5"><title>Medical Doctor Evaluation of Model-Generated Rationales and Treatment Strategies</title><p>A total of 3 clinical evaluators independently assessed the clinical rationales and treatment strategies generated by the best-performing model for 256 correctly predicted cases. As presented in <xref ref-type="table" rid="table4">Table 4</xref>, the highest total rating was observed for correctness (mean 4.3, SD 0.7). Consistency, specificity, and helpfulness also received favorable evaluations (means 4.2, 4.2, and 4.2, respectively). Human likeness received the lowest but still positive rating (mean 3.6, SD 1.7). Notably, the board-certified psychiatrist rated helpfulness highest (mean 4.5, SD 0.6), while consistency scores varied most between evaluators, ranging from a mean of 3.4 to 4.9. 
To demonstrate the interpretability of the model&#x2019;s reasoning process, one representative remission case (&#x201C;yes&#x201D;) and one nonremission case (&#x201C;no&#x201D;) were selected as examples, each accompanied by psychiatrist evaluations and comments. These illustrative cases are presented in <xref ref-type="supplementary-material" rid="app6">Multimedia Appendices 6</xref> and <xref ref-type="supplementary-material" rid="app7">7</xref>.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Evaluations on clinical rationales and treatment strategies assigned by a board-certified psychiatrist and psychiatry residents for the clinical outputs produced by the best model across 256 correctly predicted cases.<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Consistency, mean (SD)</td><td align="left" valign="bottom">Correctness, mean (SD)</td><td align="left" valign="bottom">Specificity, mean (SD)</td><td align="left" valign="bottom">Helpfulness, mean (SD)</td><td align="left" valign="bottom">Human likeness, mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top">Psychiatrist</td><td align="left" valign="top">3.4 (0.6)</td><td align="left" valign="top">4.3 (0.5)</td><td align="left" valign="top">4.0 (0.5)</td><td align="left" valign="top">4.5 (0.6)</td><td align="left" valign="top">3.5 (0.5)</td></tr><tr><td align="left" valign="top">Resident 1</td><td align="left" valign="top">4.3 (0.5)</td><td align="left" valign="top">4.4 (0.7)</td><td align="left" valign="top">4.2 (0.6)</td><td align="left" valign="top">4.3 (0.7)</td><td align="left" valign="top">3.9 (2.6)</td></tr><tr><td align="left" valign="top">Resident 2</td><td align="left" valign="top">4.9 (0.4)</td><td align="left" valign="top">4.2 (0.8)</td><td align="left" valign="top">4.3 (0.8)</td><td align="left" 
valign="top">3.9 (1.3)</td><td align="left" valign="top">3.4 (1.2)</td></tr><tr><td align="left" valign="top">Residents<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="left" valign="top">4.6 (0.5)</td><td align="left" valign="top">4.3 (0.7)</td><td align="left" valign="top">4.3 (0.7)</td><td align="left" valign="top">4.1 (1.1)</td><td align="left" valign="top">3.6 (2.0)</td></tr><tr><td align="left" valign="top">Total<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td><td align="left" valign="top">4.2 (0.8)</td><td align="left" valign="top">4.3 (0.7)</td><td align="left" valign="top">4.2 (0.7)</td><td align="left" valign="top">4.2 (1.0)</td><td align="left" valign="top">3.6 (1.7)</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>Assessments were conducted across 5 domains using a 5-point scale (1-5), with higher scores indicating better performance. </p></fn><fn id="table4fn2"><p><sup>b</sup>The "residents" row represents the aggregated scores from both residents.</p></fn><fn id="table4fn3"><p><sup>c</sup>"Total" indicates the combined assessment across all 3 evaluators.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>Reasoning-based LLMs, especially when guided by research-informed prompting strategies, demonstrate promising potential in predicting antidepressant treatment response among patients with depressive disorder. 
To the best of our knowledge, this is among the first applications of LLMs for forecasting remission outcomes in depression, extending beyond prior approaches that primarily used traditional statistical and ML models [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>].</p><p>In zero-shot contexts, all models showed higher sensitivity (0.6690&#x2010;0.9085) than specificity (0.3185&#x2010;0.6331). Balanced accuracy improved with enhanced reasoning: ChatGPT o1 by 3.22%, ChatGPT o3-mini by 3.3%, and Claude 3.7 Sonnet by 4.8%, with Claude achieving the highest performance (0.6656) at 32,000 budget tokens. This supports prior findings on the importance of reasoning capabilities in medical applications [<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref38">38</xref>], suggesting that enhanced reasoning depth improves LLM performance in specific clinical tasks. Moreover, our proposed RoD technique, which emulates how clinicians incorporate contemporary research findings into their clinical reasoning process, outperformed zero-shot CoT and AoT with the highest balanced accuracy (0.6697). Although further research is required, RoD appears effective for psychiatric prediction tasks. Compared with conventional ML baselines (<xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>), which achieved balanced accuracies of 0.6077 to 0.7371 and sensitivities of 0.3533 to 0.6364, our reasoning-based LLM approach demonstrated higher sensitivity, indicating improved identification of patients who ultimately achieved remission. In medication-specific analyses that excluded antidepressants with fewer than 10 cases, NPV remained high (&#x003E;0.75) across all medications. 
For escitalopram, which was the most frequently prescribed antidepressant in the cohort (n=159), the RoD prompting approach achieved a balanced accuracy of 0.6799. Although direct comparison is limited by differences in sample size and methodology, this value is numerically higher than the 0.61 balanced accuracy reported in a prior partial least squares regression analysis of 92 escitalopram-treated patients [<xref ref-type="bibr" rid="ref36">36</xref>], suggesting that reasoning-based LLMs may achieve comparable or potentially improved predictive capability within a single antidepressant group.</p><p>A particularly noteworthy finding is the contrasting performance between traditional reasoning approaches (CoT/AoT) and our knowledge-augmented RoD strategy. While CoT and AoT showed minimal improvement or even slight performance degradation compared to zero-shot prompting, RoD achieved consistent improvements across all metrics. This divergence suggests that for clinical pattern-recognition tasks, the decomposition of reasoning steps alone (as in CoT/AoT) may introduce unnecessary complexity without meaningful benefit. In contrast, RoD&#x2019;s incorporation of synthesized research evidence appears to provide crucial contextual priors that enhance prediction accuracy. This mirrors actual clinical practice, where psychiatrists integrate empirical evidence from literature with patient-specific data rather than relying solely on sequential logical reasoning.</p><p>The superior performance of RoD likely stems from its ability to leverage documented patterns in depressive disorder treatment outcomes, effectively providing the model with a knowledge base of established clinical associations. This approach compensates for the inherent limitations of LLMs in medical domains, where training data may not adequately capture the full spectrum of clinical scenarios. 
Furthermore, by grounding predictions in research evidence, RoD may reduce the risk of hallucinations or clinically implausible outputs that can occur with pure reasoning approaches, a critical concern in medical artificial intelligence (AI) applications [<xref ref-type="bibr" rid="ref37">37</xref>]. These findings align with recent evidence suggesting that retrieval-augmented approaches enhance LLM reliability in clinical contexts [<xref ref-type="bibr" rid="ref38">38</xref>]. The hybrid strategy combining LLM reasoning with structured knowledge integration may represent an optimal approach for clinical prediction tasks, particularly in psychiatry, where outcomes are influenced by complex biopsychosocial factors [<xref ref-type="bibr" rid="ref39">39</xref>].</p></sec><sec id="s4-2"><title>Clinical Implications</title><p>Clinical evaluation of the model-generated rationales and treatment suggestions revealed high ratings for correctness, consistency, specificity, and perceived helpfulness, indicating that reasoning-based LLMs can produce clinically coherent and contextually relevant outputs. Favorable assessments by practicing clinicians further suggest their potential as valuable adjuncts in real-world clinical decision-making, particularly for the early identification of patients at risk of treatment nonremission. Unlike prior models focused mainly on predictive performance, our approach emphasizes interpretability and clinician usability, which are key elements for real-world application. By integrating biomarker and clinical data with advanced reasoning, LLMs may support more personalized and effective treatment decisions. 
Nonetheless, relatively lower ratings for human likeness highlight the need for improved communication style to foster trust and interpretability in clinical practice.</p><p>The high NPV (&#x003E;0.75) across all medication classes suggests particular utility as a screening tool to identify patients unlikely to achieve remission with standard first-line treatments. This could enable a stratified care approach, where predicted nonresponders receive enhanced monitoring, earlier treatment adjustments, or augmentation strategies, potentially reducing the typical 12-week trial-and-error period. Such implementation aligns with recent frameworks for integrating AI into clinical psychiatry that emphasize augmentation rather than replacement of clinical judgment [<xref ref-type="bibr" rid="ref40">40</xref>]. The RoD prompting strategy required an average processing time of 43.88 seconds per patient, suggesting that real-time clinical application is feasible within standard consultation time frames.</p><p>From a health economics perspective, early identification of nonresponders could substantially reduce costs associated with prolonged ineffective treatments, emergency interventions, and productivity losses. The ability to provide detailed clinical rationales distinguishes our approach from black-box algorithms, addressing a critical barrier to AI adoption in psychiatry, where understanding the reasoning behind recommendations is essential for clinical acceptance and regulatory approval [<xref ref-type="bibr" rid="ref41">41</xref>]. 
Moreover, the cloud-based nature of LLMs enables deployment without specialized hardware, making this technology accessible to resource-limited settings where psychiatric expertise may be scarce [<xref ref-type="bibr" rid="ref42">42</xref>].</p><p>Successful clinical implementation would require integration with electronic health records, development of user-friendly interfaces, and establishment of clear protocols for acting on model predictions. The model&#x2019;s ability to suggest alternative treatment strategies when predicting nonremission provides actionable guidance rather than mere risk stratification, potentially improving clinical utility. Furthermore, the transparent reasoning process could serve an educational function, helping less experienced clinicians understand factors influencing treatment response and potentially improving their clinical reasoning skills over time [<xref ref-type="bibr" rid="ref43">43</xref>]. Prospective validation studies are warranted to confirm these findings in real-world clinical settings.</p></sec><sec id="s4-3"><title>Limitations</title><p>Despite promising findings, several limitations warrant consideration. First, while our approach demonstrated robust sensitivity (0.7183) and NPV (0.7938), the relatively low PPV (0.5204) may generate false positives, potentially complicating treatment planning for patients misclassified as achieving remission [<xref ref-type="bibr" rid="ref44">44</xref>]. The relatively modest PPV observed in our model should be interpreted in light of the low remission prevalence in our cohort, a condition known to constrain PPV despite adequate discriminative performance. Although PPV was modest, the model demonstrated balanced accuracy and sensitivity at clinically meaningful levels, supporting its capacity for reliable risk stratification in a heterogeneous depressive population. 
Importantly, the high NPV suggests that the model may be particularly effective for identifying patients unlikely to achieve remission, thereby enabling early treatment modifications or augmentation strategies to improve outcomes. These findings emphasize that the model is intended as an adjunctive decision-support tool, and its predictions should be integrated with comprehensive clinical assessments.</p><p>Medication-specific analyses revealed sample imbalances (<xref ref-type="table" rid="table3">Table 3</xref>), with escitalopram dominating (n=159) and several medications having fewer than 20 cases. Although overall model performance remained robust, medication-specific metrics should be interpreted with caution for drugs with limited samples. This imbalance reflects real-world prescribing patterns but limits our ability to make definitive conclusions about model performance for less commonly prescribed antidepressants [<xref ref-type="bibr" rid="ref45">45</xref>]. Future studies should either focus on medications with adequate sample sizes or use targeted recruitment strategies to ensure sufficient representation across all medication classes.</p><p>Our clinical evaluation methodology has notable limitations. The assessment was conducted by only 3 evaluators from a single institution, potentially introducing institutional bias and limiting generalizability. More critically, evaluation was restricted to correctly predicted cases, which likely inflates perceived quality scores and fails to capture model behavior in misclassification scenarios. Future studies should incorporate multi-institutional evaluators and a comprehensive assessment of both correct and incorrect predictions to provide more robust validation of AI-assisted diagnostic approaches.</p><p>Finally, the RoD method requires further comparative evaluation against alternative knowledge-augmented techniques to determine its optimal application in psychiatric contexts. 
Validation in ethnically diverse populations with larger numbers of clinical expert appraisals remains essential. Prospective randomized trials are needed to evaluate whether model recommendations improve clinical outcomes and decision-making in practice.</p></sec><sec id="s4-4"><title>Conclusions</title><p>In conclusion, this study demonstrates the promising potential of reasoning-based LLMs for predicting antidepressant treatment response in patients with depressive disorder. Our findings highlight the RoD technique, which achieved the highest performance by integrating research evidence with clinical reasoning, representing an important advance toward AI-assisted clinical decision support in psychiatry. The high NPV (&#x003E;0.75) across medications suggests particular utility as a screening tool for identifying patients unlikely to achieve remission with standard treatments. While limitations exist, including the need for validation in diverse populations and larger-scale clinical evaluations, the positive assessment by clinical experts validates the potential use of these approaches. Future research should focus on expanding real-world treatment outcome datasets, conducting multi-institutional clinical evaluations, and developing models that can both predict the magnitude of treatment response and suggest personalized next-step strategies. 
These advances could enable clinicians to make more informed, evidence-based decisions in selecting the most effective personalized treatment strategies for patients with depressive disorder.</p></sec></sec></body><back><notes><sec><title>Funding</title><p>This research was supported by the Bio &#x0026; Medical Technology Development Program of the National Research Foundation funded by the Korean government (Ministry of Science and ICT) (RS-2024-00440371 and RS-2024-00457381).</p></sec><sec><title>Data Availability</title><p>The data that support the findings of the study are available from Jae-Min Kim upon reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>JMK had full access to all of the data in the study and takes responsibility for the integrity of the data and the accuracy of the data analysis.</p><p>Concept and design: JMK, HL, HJK, JHP</p><p>Acquisition, analysis, and interpretation of data: All authors</p><p>Manuscript drafting: JMK, HL, HJK, JHP</p><p>Critical review of the manuscript for important intellectual content: All authors</p><p>Statistical analysis: JMK, HL, HJK, JHP</p><p>Funding: JMK and HL</p><p>Administrative, technical, or material support: JMK, HJK, JHJ, SGK, JWK</p><p>Supervision: All authors</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">AoT</term><def><p>atom-of-thoughts</p></def></def-item><def-item><term id="abb3">CoT</term><def><p>chain-of-thoughts</p></def></def-item><def-item><term id="abb4">HAM-D</term><def><p>Hamilton Depression Rating Scale</p></def></def-item><def-item><term id="abb5">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb6">MAKE BETTER study</term><def><p>MAKE Biomarker Discovery for Enhancing Antidepressant Treatment Effect and Response study</p></def></def-item><def-item><term 
id="abb7">ML</term><def><p>machine learning</p></def></def-item><def-item><term id="abb8">NPV</term><def><p>negative predictive value</p></def></def-item><def-item><term id="abb9">PPV</term><def><p>positive predictive value</p></def></def-item><def-item><term id="abb10">RoD</term><def><p>referencing of deep research</p></def></def-item><def-item><term id="abb11">SNRI</term><def><p>serotonin and norepinephrine reuptake inhibitor</p></def></def-item><def-item><term id="abb12">SSRI</term><def><p>selective serotonin reuptake inhibitor</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><collab>GBD 2019 Mental Disorders Collaborators</collab></person-group><article-title>Global, regional, and national burden of 12 mental disorders in 204 countries and territories, 1990&#x2013;2019: a systematic analysis for the Global Burden of Disease Study 2019</article-title><source>Lancet Psychiatry</source><year>2022</year><month>02</month><volume>9</volume><issue>2</issue><fpage>137</fpage><lpage>150</lpage><pub-id pub-id-type="doi">10.1016/S2215-0366(21)00395-3</pub-id><pub-id pub-id-type="medline">35026139</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>SW</given-names> </name><name name-style="western"><surname>Stewart</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Predictors of 12-week remission in a nationwide cohort of people with depressive disorders: the CRESCEND study</article-title><source>Hum Psychopharmacol</source><year>2011</year><month>01</month><volume>26</volume><issue>1</issue><fpage>41</fpage><lpage>50</lpage><pub-id 
pub-id-type="doi">10.1002/hup.1168</pub-id><pub-id pub-id-type="medline">21344501</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jin</surname><given-names>YT</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>HY</given-names> </name><name name-style="western"><surname>Jhon</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Prediction of 12-Week remission by psychopharmacological treatment step in patients with depressive disorders</article-title><source>Psychiatry Investig</source><year>2022</year><month>10</month><volume>19</volume><issue>10</issue><fpage>866</fpage><lpage>871</lpage><pub-id pub-id-type="doi">10.30773/pi.2022.0160</pub-id><pub-id pub-id-type="medline">36327967</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Walter</surname><given-names>HJ</given-names> </name><name name-style="western"><surname>Abright</surname><given-names>AR</given-names> </name><name name-style="western"><surname>Bukstein</surname><given-names>OG</given-names> </name><etal/></person-group><article-title>Clinical practice guideline for the assessment and treatment of children and adolescents with major and persistent depressive disorders</article-title><source>J Am Acad Child Adolesc Psychiatry</source><year>2023</year><month>05</month><volume>62</volume><issue>5</issue><fpage>479</fpage><lpage>502</lpage><pub-id pub-id-type="doi">10.1016/j.jaac.2022.10.001</pub-id><pub-id pub-id-type="medline">36273673</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Perlman</surname><given-names>K</given-names> </name><name 
name-style="western"><surname>Benrimoh</surname><given-names>D</given-names> </name><name name-style="western"><surname>Israel</surname><given-names>S</given-names> </name><etal/></person-group><article-title>A systematic meta-review of predictors of antidepressant treatment outcome in major depressive disorder</article-title><source>J Affect Disord</source><year>2019</year><month>01</month><day>15</day><volume>243</volume><fpage>503</fpage><lpage>515</lpage><pub-id pub-id-type="doi">10.1016/j.jad.2018.09.067</pub-id><pub-id pub-id-type="medline">30286415</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sharma</surname><given-names>A</given-names> </name><name name-style="western"><surname>Barrett</surname><given-names>MS</given-names> </name><name name-style="western"><surname>Cucchiara</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Gooneratne</surname><given-names>NS</given-names> </name><name name-style="western"><surname>Thase</surname><given-names>ME</given-names> </name></person-group><article-title>A breathing-based meditation intervention for patients with major depressive disorder following inadequate response to antidepressants: a randomized pilot study</article-title><source>J Clin Psychiatry</source><year>2017</year><month>01</month><volume>78</volume><issue>1</issue><fpage>e59</fpage><lpage>e63</lpage><pub-id pub-id-type="doi">10.4088/JCP.16m10819</pub-id><pub-id pub-id-type="medline">27898207</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Benoit</surname><given-names>JRA</given-names> </name><name name-style="western"><surname>Dursun</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Greiner</surname><given-names>R</given-names> 
</name><etal/></person-group><article-title>Using machine learning to predict remission in patients with major depressive disorder treated with desvenlafaxine</article-title><source>Can J Psychiatry</source><year>2022</year><month>01</month><volume>67</volume><issue>1</issue><fpage>39</fpage><lpage>47</lpage><pub-id pub-id-type="doi">10.1177/07067437211037141</pub-id><pub-id pub-id-type="medline">34379019</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Salem</surname><given-names>H</given-names> </name><name name-style="western"><surname>Huynh</surname><given-names>T</given-names> </name><name name-style="western"><surname>Topolski</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Temporal multi-step predictive modeling of remission in major depressive disorder using early stage treatment data; STAR*D based machine learning approach</article-title><source>J Affect Disord</source><year>2023</year><month>03</month><day>1</day><volume>324</volume><fpage>286</fpage><lpage>293</lpage><pub-id pub-id-type="doi">10.1016/j.jad.2022.12.076</pub-id><pub-id pub-id-type="medline">36584711</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Carr</surname><given-names>E</given-names> </name><name name-style="western"><surname>Rietschel</surname><given-names>M</given-names> </name><name name-style="western"><surname>Mors</surname><given-names>O</given-names> </name><etal/></person-group><article-title>Optimizing the prediction of depression remission: a longitudinal machine learning approach</article-title><source>Am J Med Genet B Neuropsychiatr Genet</source><year>2025</year><month>04</month><volume>198</volume><issue>3</issue><fpage>e33014</fpage><pub-id 
pub-id-type="doi">10.1002/ajmg.b.33014</pub-id><pub-id pub-id-type="medline">39470297</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhukovsky</surname><given-names>P</given-names> </name><name name-style="western"><surname>Trivedi</surname><given-names>MH</given-names> </name><name name-style="western"><surname>Weissman</surname><given-names>M</given-names> </name><name name-style="western"><surname>Parsey</surname><given-names>R</given-names> </name><name name-style="western"><surname>Kennedy</surname><given-names>S</given-names> </name><name name-style="western"><surname>Pizzagalli</surname><given-names>DA</given-names> </name></person-group><article-title>Generalizability of treatment outcome prediction across antidepressant treatment trials in depression</article-title><source>JAMA Netw Open</source><year>2025</year><month>03</month><day>3</day><volume>8</volume><issue>3</issue><fpage>e251310</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2025.1310</pub-id><pub-id pub-id-type="medline">40111362</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cheng</surname><given-names>SW</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>CW</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>WJ</given-names> </name><etal/></person-group><article-title>The now and future of ChatGPT and GPT in psychiatry</article-title><source>Psychiatry Clin Neurosci</source><year>2023</year><month>11</month><volume>77</volume><issue>11</issue><fpage>592</fpage><lpage>596</lpage><pub-id pub-id-type="doi">10.1111/pcn.13588</pub-id><pub-id pub-id-type="medline">37612880</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shin</surname><given-names>D</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>H</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>S</given-names> </name><name name-style="western"><surname>Cho</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Jung</surname><given-names>W</given-names> </name></person-group><article-title>Using large language models to detect depression from user-generated diary text data as a novel approach in digital mental health screening: instrument validation study</article-title><source>J Med Internet Res</source><year>2024</year><month>09</month><day>18</day><volume>26</volume><fpage>e54617</fpage><pub-id pub-id-type="doi">10.2196/54617</pub-id><pub-id pub-id-type="medline">39292502</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Omar</surname><given-names>M</given-names> </name><name name-style="western"><surname>Soffer</surname><given-names>S</given-names> </name><name name-style="western"><surname>Charney</surname><given-names>AW</given-names> </name><name name-style="western"><surname>Landi</surname><given-names>I</given-names> </name><name name-style="western"><surname>Nadkarni</surname><given-names>GN</given-names> </name><name name-style="western"><surname>Klang</surname><given-names>E</given-names> </name></person-group><article-title>Applications of large language models in psychiatry: a systematic review</article-title><source>Front Psychiatry</source><year>2024</year><volume>15</volume><fpage>1422807</fpage><pub-id pub-id-type="doi">10.3389/fpsyt.2024.1422807</pub-id><pub-id pub-id-type="medline">38979501</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation 
citation-type="web"><article-title>Introducing OpenAI o1-preview</article-title><source>OpenAI</source><year>2024</year><access-date>2025-12-23</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openai.com/research/introducing-openai-o1-preview">https://openai.com/research/introducing-openai-o1-preview</ext-link></comment></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="web"><article-title>OpenAI o3-mini: pushing the frontier of cost-effective reasoning</article-title><source>OpenAI</source><year>2025</year><access-date>2025-12-23</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openai.com/research/openai-o3-mini">https://openai.com/research/openai-o3-mini</ext-link></comment></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="web"><article-title>Claude's extended thinking</article-title><source>Anthropic</source><year>2025</year><access-date>2025-12-23</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.anthropic.com/news/visible-extended-thinking">https://www.anthropic.com/news/visible-extended-thinking</ext-link></comment></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Xie</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>B</given-names> </name><name name-style="western"><surname>Zong</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>A preliminary study of o1 in medicine: are we closer to an AI doctor?</article-title><source>arXiv</source><comment>Preprint posted online on Sep 
23, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2409.15277</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>J</given-names> </name><name name-style="western"><surname>Cai</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Ji</surname><given-names>K</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>W</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>R</given-names> </name><etal/></person-group><article-title>HuatuoGPT-o1, towards medical complex reasoning with LLMs</article-title><source>arXiv</source><comment>Preprint posted online on Dec 25, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2412.18925</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Mondillo</surname><given-names>G</given-names> </name><name name-style="western"><surname>Colosimo</surname><given-names>S</given-names> </name><name name-style="western"><surname>Perrotta</surname><given-names>A</given-names> </name><name name-style="western"><surname>Frattolillo</surname><given-names>V</given-names> </name><name name-style="western"><surname>Masino</surname><given-names>M</given-names> </name></person-group><article-title>Comparative evaluation of advanced AI reasoning models in pediatric clinical decision support: ChatGPT O1 vs. 
DeepSeek-R1</article-title><source>medRxiv</source><comment>Preprint posted online on Jan 27, 2025</comment><pub-id pub-id-type="doi">10.1101/2025.01.27.25321169</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Mondillo</surname><given-names>G</given-names> </name><name name-style="western"><surname>Masino</surname><given-names>M</given-names> </name><name name-style="western"><surname>Colosimo</surname><given-names>S</given-names> </name><name name-style="western"><surname>Perrotta</surname><given-names>A</given-names> </name><name name-style="western"><surname>Frattolillo</surname><given-names>V</given-names> </name></person-group><article-title>Evaluating AI reasoning models in pediatric medicine: a comparative analysis of o3-mini and o3-mini-high</article-title><source>medRxiv</source><comment>Preprint posted online on Feb 27, 2025</comment><pub-id pub-id-type="doi">10.1101/2025.02.27.25323028</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xu</surname><given-names>P</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Jin</surname><given-names>K</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>X</given-names> </name><name name-style="western"><surname>He</surname><given-names>M</given-names> </name><name name-style="western"><surname>Shi</surname><given-names>D</given-names> </name></person-group><article-title>DeepSeek-R1 outperforms Gemini 2.0 Pro, OpenAI o1, and o3-mini in bilingual complex ophthalmology reasoning</article-title><source>Adv Ophthalmol Pract Res</source><year>2025</year><volume>5</volume><issue>3</issue><fpage>189</fpage><lpage>195</lpage><pub-id 
pub-id-type="doi">10.1016/j.aopr.2025.05.001</pub-id><pub-id pub-id-type="medline">40678192</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kang</surname><given-names>HJ</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>JW</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>SY</given-names> </name><etal/></person-group><article-title>The MAKE biomarker discovery for enhancing antidepressant treatment effect and response (MAKE BETTER) study: design and methodology</article-title><source>Psychiatry Investig</source><year>2018</year><month>05</month><volume>15</volume><issue>5</issue><fpage>538</fpage><lpage>545</lpage><pub-id pub-id-type="doi">10.30773/pi.2017.10.2</pub-id><pub-id pub-id-type="medline">29614851</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sheehan</surname><given-names>DV</given-names> </name><name name-style="western"><surname>Lecrubier</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Sheehan</surname><given-names>KH</given-names> </name><etal/></person-group><article-title>The MINI-International Neuropsychiatric Interview (M.I.N.I.): the development and validation of a structured diagnostic psychiatric interview for DSM-IV and ICD-10</article-title><source>J Clin Psychiatry</source><year>1998</year><volume>59 Suppl 20</volume><fpage>22</fpage><lpage>33</lpage><pub-id pub-id-type="medline">9881538</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Overall</surname><given-names>JE</given-names> </name><name 
name-style="western"><surname>Gorham</surname><given-names>DR</given-names> </name></person-group><article-title>The brief psychiatric rating scale</article-title><source>Psychol Rep</source><year>1962</year><month>06</month><volume>10</volume><issue>3</issue><fpage>799</fpage><lpage>812</lpage><pub-id pub-id-type="doi">10.2466/pr0.1962.10.3.799</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hamilton</surname><given-names>M</given-names> </name></person-group><article-title>A rating scale for depression</article-title><source>J Neurol Neurosurg Psychiatry</source><year>1960</year><month>02</month><volume>23</volume><issue>1</issue><fpage>56</fpage><lpage>62</lpage><pub-id pub-id-type="doi">10.1136/jnnp.23.1.56</pub-id><pub-id pub-id-type="medline">14399272</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rabin</surname><given-names>R</given-names> </name><name name-style="western"><surname>de Charro</surname><given-names>F</given-names> </name></person-group><article-title>EQ-5D: a measure of health status from the EuroQol Group</article-title><source>Ann Med</source><year>2001</year><month>07</month><volume>33</volume><issue>5</issue><fpage>337</fpage><lpage>343</lpage><pub-id pub-id-type="doi">10.3109/07853890109002087</pub-id><pub-id pub-id-type="medline">11491192</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Sheehan</surname><given-names>DV</given-names> </name></person-group><source>The Anxiety Disease</source><year>1983</year><publisher-name>Charles Scribner&#x2019;s Sons</publisher-name><fpage>144</fpage><lpage>153</lpage><pub-id 
pub-id-type="other">9780684180472</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cohen</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kamarck</surname><given-names>T</given-names> </name><name name-style="western"><surname>Mermelstein</surname><given-names>R</given-names> </name></person-group><article-title>A global measure of perceived stress</article-title><source>J Health Soc Behav</source><year>1983</year><month>12</month><volume>24</volume><issue>4</issue><fpage>385</fpage><lpage>396</lpage><pub-id pub-id-type="doi">10.2307/2136404</pub-id><pub-id pub-id-type="medline">6668417</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Connor</surname><given-names>KM</given-names> </name><name name-style="western"><surname>Davidson</surname><given-names>JRT</given-names> </name></person-group><article-title>Development of a new resilience scale: the Connor-Davidson Resilience Scale (CD-RISC)</article-title><source>Depress Anxiety</source><year>2003</year><volume>18</volume><issue>2</issue><fpage>76</fpage><lpage>82</lpage><pub-id pub-id-type="doi">10.1002/da.10113</pub-id><pub-id pub-id-type="medline">12964174</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zimet</surname><given-names>GD</given-names> </name><name name-style="western"><surname>Dahlem</surname><given-names>NW</given-names> </name><name name-style="western"><surname>Zimet</surname><given-names>SG</given-names> </name><name name-style="western"><surname>Farley</surname><given-names>GK</given-names> </name></person-group><article-title>The multidimensional scale of perceived social 
support</article-title><source>J Pers Assess</source><year>1988</year><month>03</month><volume>52</volume><issue>1</issue><fpage>30</fpage><lpage>41</lpage><pub-id pub-id-type="doi">10.1207/s15327752jpa5201_2</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kojima</surname><given-names>T</given-names> </name><name name-style="western"><surname>Gu</surname><given-names>SS</given-names> </name><name name-style="western"><surname>Reid</surname><given-names>M</given-names> </name><name name-style="western"><surname>Matsuo</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Iwasawa</surname><given-names>Y</given-names> </name></person-group><article-title>Large language models are zero-shot reasoners</article-title><year>2022</year><access-date>2025-12-23</access-date><conf-name>NIPS&#x2019;22: Proceedings of the 36th International Conference on Neural Information Processing Systems</conf-name><conf-date>Nov 28 to Dec 9, 2022</conf-date><conf-loc>New Orleans, Louisiana, USA</conf-loc><fpage>22199</fpage><lpage>22213</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://dl.acm.org/doi/10.5555/3600270.3601883">https://dl.acm.org/doi/10.5555/3600270.3601883</ext-link></comment><pub-id pub-id-type="doi">10.5555/3600270.3601883</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Teng</surname><given-names>F</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Shi</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>C</given-names> </name><name 
name-style="western"><surname>Luo</surname><given-names>Y</given-names> </name></person-group><article-title>Atom of thoughts for Markov LLM test-time scaling</article-title><source>arXiv</source><comment>Preprint posted online on Feb 17, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2502.12018</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="web"><article-title>Introducing deep research</article-title><source>OpenAI</source><year>2025</year><access-date>2025-12-23</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openai.com/index/introducing-deep-research/">https://openai.com/index/introducing-deep-research/</ext-link></comment></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kwon</surname><given-names>T</given-names> </name><name name-style="western"><surname>Ong</surname><given-names>KT</given-names> </name><name name-style="western"><surname>Kang</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Large language models are clinical reasoners: reasoning-aware diagnosis framework with prompt-generated rationales</article-title><year>2024</year><access-date>2025-12-23</access-date><conf-name>Proceedings of the AAAI Conference on Artificial Intelligence</conf-name><conf-date>Feb 20-27, 2024</conf-date><conf-loc>Vancouver, Canada</conf-loc><fpage>18417</fpage><lpage>18425</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://ojs.aaai.org/index.php/AAAI/article/view/29802">https://ojs.aaai.org/index.php/AAAI/article/view/29802</ext-link></comment><pub-id pub-id-type="doi">10.1609/aaai.v38i16.29802</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wallert</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Boberg</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kaldo</surname><given-names>V</given-names> </name><etal/></person-group><article-title>Predicting remission after internet-delivered psychotherapy in patients with depression using machine learning and multi-modal data</article-title><source>Transl Psychiatry</source><year>2022</year><month>09</month><day>1</day><volume>12</volume><issue>1</issue><fpage>357</fpage><pub-id pub-id-type="doi">10.1038/s41398-022-02133-3</pub-id><pub-id pub-id-type="medline">36050305</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>LoParo</surname><given-names>D</given-names> </name><name name-style="western"><surname>Dunlop</surname><given-names>BW</given-names> </name><name name-style="western"><surname>Nemeroff</surname><given-names>CB</given-names> </name><name name-style="western"><surname>Mayberg</surname><given-names>HS</given-names> </name><name name-style="western"><surname>Craighead</surname><given-names>WE</given-names> </name></person-group><article-title>Prediction of individual patient outcomes to psychotherapy vs medication for major depression</article-title><source>Npj Ment Health Res</source><year>2025</year><month>02</month><day>5</day><volume>4</volume><issue>1</issue><fpage>4</fpage><pub-id pub-id-type="doi">10.1038/s44184-025-00119-9</pub-id><pub-id pub-id-type="medline">39910171</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Omiye</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Lester</surname><given-names>JC</given-names> </name><name name-style="western"><surname>Spichak</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Rotemberg</surname><given-names>V</given-names> </name><name name-style="western"><surname>Daneshjou</surname><given-names>R</given-names> </name></person-group><article-title>Large language models propagate race-based medicine</article-title><source>NPJ Digit Med</source><year>2023</year><month>10</month><day>20</day><volume>6</volume><issue>1</issue><fpage>195</fpage><pub-id pub-id-type="doi">10.1038/s41746-023-00939-z</pub-id><pub-id pub-id-type="medline">37864012</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zakka</surname><given-names>C</given-names> </name><name name-style="western"><surname>Shad</surname><given-names>R</given-names> </name><name name-style="western"><surname>Chaurasia</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Almanac - retrieval-augmented language models for clinical medicine</article-title><source>NEJM AI</source><year>2024</year><month>02</month><volume>1</volume><issue>2</issue><fpage>AIoa2300068</fpage><pub-id pub-id-type="doi">10.1056/aioa2300068</pub-id><pub-id pub-id-type="medline">38343631</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Borrell-Carri&#x00F3;</surname><given-names>F</given-names> </name><name name-style="western"><surname>Suchman</surname><given-names>AL</given-names> </name><name name-style="western"><surname>Epstein</surname><given-names>RM</given-names> </name></person-group><article-title>The biopsychosocial model 25 years later: principles, practice, and scientific inquiry</article-title><source>Ann Fam Med</source><year>2004</year><volume>2</volume><issue>6</issue><fpage>576</fpage><lpage>582</lpage><pub-id pub-id-type="doi">10.1370/afm.245</pub-id><pub-id 
pub-id-type="medline">15576544</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Topol</surname><given-names>EJ</given-names> </name></person-group><article-title>High-performance medicine: the convergence of human and artificial intelligence</article-title><source>Nat Med</source><year>2019</year><month>01</month><volume>25</volume><issue>1</issue><fpage>44</fpage><lpage>56</lpage><pub-id pub-id-type="doi">10.1038/s41591-018-0300-7</pub-id><pub-id pub-id-type="medline">30617339</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wiens</surname><given-names>J</given-names> </name><name name-style="western"><surname>Saria</surname><given-names>S</given-names> </name><name name-style="western"><surname>Sendak</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Do no harm: a roadmap for responsible machine learning for health care</article-title><source>Nat Med</source><year>2019</year><month>09</month><volume>25</volume><issue>9</issue><fpage>1337</fpage><lpage>1340</lpage><pub-id pub-id-type="doi">10.1038/s41591-019-0548-6</pub-id><pub-id pub-id-type="medline">31427808</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thirunavukarasu</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSJ</given-names> </name><name name-style="western"><surname>Elangovan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Gutierrez</surname><given-names>L</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>TF</given-names> </name><name 
name-style="western"><surname>Ting</surname><given-names>DSW</given-names> </name></person-group><article-title>Large language models in medicine</article-title><source>Nat Med</source><year>2023</year><month>08</month><volume>29</volume><issue>8</issue><fpage>1930</fpage><lpage>1940</lpage><pub-id pub-id-type="doi">10.1038/s41591-023-02448-8</pub-id><pub-id pub-id-type="medline">37460753</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Beam</surname><given-names>AL</given-names> </name><name name-style="western"><surname>Manrai</surname><given-names>AK</given-names> </name><name name-style="western"><surname>Ghassemi</surname><given-names>M</given-names> </name></person-group><article-title>Challenges to the reproducibility of machine learning models in health care</article-title><source>JAMA</source><year>2020</year><month>01</month><day>28</day><volume>323</volume><issue>4</issue><fpage>305</fpage><lpage>306</lpage><pub-id pub-id-type="doi">10.1001/jama.2019.20866</pub-id><pub-id pub-id-type="medline">31904799</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Obradovich</surname><given-names>N</given-names> </name><name name-style="western"><surname>Khalsa</surname><given-names>SS</given-names> </name><name name-style="western"><surname>Khan</surname><given-names>W</given-names> </name><etal/></person-group><article-title>Opportunities and risks of large language models in psychiatry</article-title><source>NPP Digit Psychiatry Neurosci</source><year>2024</year><volume>2</volume><issue>1</issue><fpage>8</fpage><pub-id pub-id-type="doi">10.1038/s44277-024-00010-z</pub-id><pub-id pub-id-type="medline">39554888</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bzdok</surname><given-names>D</given-names> </name><name name-style="western"><surname>Meyer-Lindenberg</surname><given-names>A</given-names> </name></person-group><article-title>Machine learning for precision psychiatry: opportunities and challenges</article-title><source>Biol Psychiatry Cogn Neurosci Neuroimaging</source><year>2018</year><month>03</month><volume>3</volume><issue>3</issue><fpage>223</fpage><lpage>230</lpage><pub-id pub-id-type="doi">10.1016/j.bpsc.2017.11.007</pub-id><pub-id pub-id-type="medline">29486863</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Supplementary materials on the MAKE BETTER study.</p><media xlink:href="mental_v13i1e83352_app1.docx" xlink:title="DOCX File, 26 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Structure of the atom-of-thoughts (AoT) prompt.</p><media xlink:href="mental_v13i1e83352_app2.docx" xlink:title="DOCX File, 409 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Confusion matrices for each zero-shot prompting under varying reasoning levels or token budgets.</p><media xlink:href="mental_v13i1e83352_app3.docx" xlink:title="DOCX File, 294 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Confusion matrices for advanced prompting strategies.</p><media xlink:href="mental_v13i1e83352_app4.docx" xlink:title="DOCX File, 108 KB"/></supplementary-material><supplementary-material id="app5"><label>Multimedia Appendix 5</label><p>Predictive performance of machine learning models for 12-week remission classification.</p><media xlink:href="mental_v13i1e83352_app5.docx" xlink:title="DOCX File, 19 KB"/></supplementary-material><supplementary-material id="app6"><label>Multimedia Appendix 
6</label><p>Representative remission (&#x201C;yes&#x201D;) case generated by the RoD prompting strategy.</p><media xlink:href="mental_v13i1e83352_app6.docx" xlink:title="DOCX File, 1046 KB"/></supplementary-material><supplementary-material id="app7"><label>Multimedia Appendix 7</label><p>Representative remission (&#x201C;no&#x201D;) case generated by the RoD prompting strategy.</p><media xlink:href="mental_v13i1e83352_app7.docx" xlink:title="DOCX File, 1417 KB"/></supplementary-material></app-group></back></article>