<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Ment Health</journal-id><journal-id journal-id-type="publisher-id">mental</journal-id><journal-id journal-id-type="index">16</journal-id><journal-title>JMIR Mental Health</journal-title><abbrev-journal-title>JMIR Ment Health</abbrev-journal-title><issn pub-type="epub">2368-7959</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v13i1e90581</article-id><article-id pub-id-type="doi">10.2196/90581</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Predicting Momentary Suicidal Ideation From Smartphone Screenshots Using Vision-Language Models: Prospective Machine Learning Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Jacobucci</surname><given-names>Ross</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Shao</surname><given-names>Wenpei</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kobrinsky</surname><given-names>Veronika</given-names></name><degrees>MA</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Ammerman</surname><given-names>Brooke</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib></contrib-group><aff id="aff1"><institution>Center for Healthy Minds, University of Wisconsin&#x2013;Madison</institution><addr-line>625 W Washington Ave</addr-line><addr-line>Madison</addr-line><addr-line>WI</addr-line><country>United States</country></aff><aff id="aff2"><institution>Department of Psychology, University of Wisconsin&#x2013;Madison</institution><addr-line>Madison</addr-line><addr-line>WI</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Torous</surname><given-names>John</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Lei</surname><given-names>Chang</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Bloom</surname><given-names>Paul</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Ross Jacobucci, PhD, Center for Healthy Minds, University of Wisconsin&#x2013;Madison, 625 W Washington Ave, Madison, WI, 53703, United States, 1 (608) 263-6321; <email>jacobucci@wisc.edu</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>8</day><month>4</month><year>2026</year></pub-date><volume>13</volume><elocation-id>e90581</elocation-id><history><date date-type="received"><day>30</day><month>12</month><year>2025</year></date><date date-type="rev-recd"><day>23</day><month>02</month><year>2026</year></date><date date-type="accepted"><day>24</day><month>02</month><year>2026</year></date></history><copyright-statement>&#x00A9; Ross Jacobucci, Wenpei Shao, Veronika Kobrinsky, Brooke Ammerman. 
Originally published in JMIR Mental Health (<ext-link ext-link-type="uri" xlink:href="https://mental.jmir.org">https://mental.jmir.org</ext-link>), 8.4.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Mental Health, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://mental.jmir.org/">https://mental.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://mental.jmir.org/2026/1/e90581"/><abstract><sec><title>Background</title><p>Passive smartphone sensing shows promise for suicide prevention, but behavioral metadata (GPS, screen time, and accelerometry) often lacks the contextual information needed to detect acute psychological distress. Analyzing what people actually see, read, and type on their phones&#x2014;rather than just usage patterns&#x2014;may provide more proximal signals of risk.</p></sec><sec><title>Objective</title><p>This study aimed to test whether vision-language models (VLMs) applied to passively captured smartphone screenshots can predict momentary suicidal ideation (SI).</p></sec><sec sec-type="methods"><title>Methods</title><p>Seventy-nine adults with past month suicidal thoughts or behaviors completed ecological momentary assessments (EMA) over 28 days while screenshots were captured every 5 seconds during active phone use. 
We fine-tuned open-source VLMs (Qwen2.5-VL [Alibaba Cloud], LFM2-VL [Liquid AI]), and text-only models (Qwen3 [Alibaba Cloud]) to predict SI from screenshots captured in the 2 hours preceding each EMA. We evaluated performance with temporal and subject holdouts.</p></sec><sec sec-type="results"><title>Results</title><p>The analytic sample comprised 2.5 million screenshots from 70 participants. Temporal holdout models achieved strong discrimination at the EMA level (AUC=0.83; AUPRC=0.77), with image-based models outperforming text-only models (AUC=0.83 vs 0.79; 95% CI 0.003-0.07). Subject holdout generalization was near chance (AUC&#x2248;0.50), though a simple lexical screening method retained modest discrimination (AUC=0.62). Smaller models performed comparably to larger models, supporting feasible on-device deployment.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Screen content predicts short-term SI with clinically meaningful accuracy when models are personalized but does not generalize across individuals. These findings support a 2-stage clinical architecture, coarse lexical screening for new patients, with personalized VLM-based monitoring after a calibration period. On-device inference may enable privacy-preserving deployment.</p></sec></abstract><kwd-group><kwd>digital phenotyping</kwd><kwd>suicide</kwd><kwd>passive sensing</kwd><kwd>phone use</kwd><kwd>smartphone</kwd><kwd>foundation models</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>Digital mental health increasingly aims to detect short-term changes in risk and deliver support precisely when needed via just-in-time adaptive interventions (JITAIs) [<xref ref-type="bibr" rid="ref1">1</xref>]. 
Continuous, unobtrusive data streams from smartphones and wearables can provide objective, high-frequency indicators of psychological states, including mobility, communication, sleep, heart rate variability, and device usage. These passive sensing modalities are well-suited to the hour-to-hour fluctuations characteristic of suicidal ideation (SI) [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref3">3</xref>]. In suicide-focused passive sensing, feasibility has been demonstrated in acute settings using phone-derived measures [<xref ref-type="bibr" rid="ref4">4</xref>], with additional evidence that combining ecological momentary assessment (EMA) with passive data improves short-horizon prediction of SI [<xref ref-type="bibr" rid="ref3">3</xref>] and that intensive longitudinal modeling can track SI trajectories over time in demanding real-world contexts [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>].</p><p>Over the past few years, the literature has expanded across sensing modalities, analytic approaches, and clinical settings. Work includes GPS-based detection of risk among high-risk adolescents [<xref ref-type="bibr" rid="ref7">7</xref>], noninvasive speech analysis in emergency care [<xref ref-type="bibr" rid="ref8">8</xref>], physiological signals from wearables for imminent risk identification [<xref ref-type="bibr" rid="ref9">9</xref>-<xref ref-type="bibr" rid="ref12">12</xref>], and smartphone-based monitoring over weekly horizons [<xref ref-type="bibr" rid="ref13">13</xref>]. Methodological advances span transformer-based emotion forecasting [<xref ref-type="bibr" rid="ref14">14</xref>] and multimodal, contrastive pipelines that fuse active and passive signals [<xref ref-type="bibr" rid="ref15">15</xref>]. 
Recent studies from our group further suggest that on-screen text and content collected passively may relate directly to suicide risk, complementing traditional sensors [<xref ref-type="bibr" rid="ref16">16</xref>-<xref ref-type="bibr" rid="ref18">18</xref>]. Taken together, systematic reviews conclude that suicide-focused passive sensing is an active area with translational promise, but that rigorous prediction remains underrepresented relative to descriptive or in-sample analyses [<xref ref-type="bibr" rid="ref19">19</xref>].</p><p>A central challenge of many passive sensing approaches is that commonly used behavioral metadata (eg, GPS, screen time, and app counts) can be distal from the psychological mechanisms that researchers and clinicians need to understand and intervene upon to prevent suicide in real-time [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref20">20</xref>]. In contrast, semantic sensing targets meaning embedded in digital interactions&#x2014;not only linguistic content, but also visual context, app interfaces, and media&#x2014;as distinct from traditional passive sensing, which captures the mechanics of behavior (metadata, movement, and usage duration; see <xref ref-type="fig" rid="figure1">Figure 1A-1C</xref>). Importantly, semantic sensing exists on a continuum rather than being synonymous with any single data stream.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Conceptual overview of screen content analysis versus conventional passive sensing. (A) Conventional passive sensing captures behavioral metadata (GPS, movement, screen time, and heart rate). (B) Screen content analysis uses vision-language models to extract semantic content from screenshots. 
(C) Signal proximity to psychological state increases from distal behavioral metadata to proximal screen content.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mental_v13i1e90581_fig01.png"/></fig><p>One increasingly used intermediate semantic modality is passively logged keyboard text input, which captures self-generated language (ie, what people type) during naturalistic smartphone use. Keyboard sensing has been used to extract clinically relevant linguistic markers from everyday communication (eg, pronoun use, sentiment, and other language features) and to link typed language and keystroke-related features to mental health symptoms over time [<xref ref-type="bibr" rid="ref21">21</xref>-<xref ref-type="bibr" rid="ref23">23</xref>]. Emerging work in adolescents also demonstrates the feasibility of detecting suicide-related language in passively collected keyboard entries using youth-tailored lexicons, highlighting the promise of semantic signals [<xref ref-type="bibr" rid="ref24">24</xref>]. Although keyboard logging meaningfully moves beyond purely mechanical signals, it provides only a partial view of the person&#x2019;s digital environments by reflecting primarily outgoing content and generally does not capture incoming messages, viewed text, or graphical/user interface context that may shape meaning.</p><p>In contrast, screenomics, referring to the high-frequency capture of smartphone screenshots every 5 seconds while in use, offers a richer semantic representation by incorporating incoming and outgoing communication, consumed content (ie, what a person reads/watches/is exposed to), and the visual and structural context of that content [<xref ref-type="bibr" rid="ref25">25</xref>], aligned with a more fine-grained semantic sensing approach. 
Such access to broader digital environments may be especially important for detecting acute SI because risk-relevant signals may be contextual rather than explicitly self-disclosed [<xref ref-type="bibr" rid="ref16">16</xref>-<xref ref-type="bibr" rid="ref18">18</xref>]. We therefore conceptualize screenshots as capturing the broadest semantic substrate of mobile experiences while recognizing keyboard sensing as a narrower, yet meaningful, semantic channel within the passive sensing literature.</p><p>Several findings from the literature inform our approach to applying screenomics to the detection of suicide risk. Temporal proximity matters; suicide-related outcomes are most predictable over short windows [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref9">9</xref>]. Integrating modalities also tends to help, though gains from adding active self-report to passive signals [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref15">15</xref>] may partly reflect shared method variance when EMA items serve as both input and outcome [<xref ref-type="bibr" rid="ref26">26</xref>]. We therefore focus on purely passive prediction to isolate the unique contribution of semantic information embedded in everyday digital environments.</p><p>Evaluation strategy poses an additional challenge in semantic sensing and digital phenotyping. Digital behaviors have idiosyncratic meanings; between-person patterns need not reflect within-person dynamics [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref27">27</xref>], as evidenced by the common finding that within-person (temporal holdout) prediction performance exceeds between-person (subject holdout) prediction [<xref ref-type="bibr" rid="ref28">28</xref>]. This presents a critical evaluation challenge. 
While person-specific baselines may offer superior predictive power, the intended clinical use case determines the appropriate evaluation strategy. Models intended for monitoring individuals over time require temporal holdouts within a person to assess temporal generalization, for example, a calibration period in a JITAI [<xref ref-type="bibr" rid="ref29">29</xref>] that learns participant-specific behavior patterns before implementing interventions in a testing phase. In contrast, models intended for screening new patients require person-level splits to assess generalization across individuals [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref31">31</xref>]. Critically, both approaches require evaluation strategies aligned with their intended use case; for instance, performance estimates can be substantially inflated when temporal and between-person sources of variance are conflated within train/test splits [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref32">32</xref>].</p></sec><sec id="s1-2"><title>Current Study</title><p>We test whether high-resolution smartphone screenshots, capturing what people actually read, watch, and type, can support moment-to-moment prediction of SI, which will ultimately support JITAI-style deployment. We leverage open-source foundation models (eg, Qwen2.5-VL [Alibaba Cloud], LFM2-VL [Liquid AI], and Qwen3 [Alibaba Cloud]) for images and text to learn representations directly from screenshot pixels and extracted text, rather than hand-crafted features. We compare 2 approaches, vision-language models (VLMs) that process screenshots as images, capturing layout, app context, and visual content, versus text-only models that operate on Optical Character Recognition (OCR)-extracted text alone. 
To align the signal with clinical decision points, we target the 2 hours preceding each EMA, a window designed to capture rapid risk fluctuations that are actionable, in real time.</p><p>Because clinical deployment faces 2 distinct generalization problems, we frame evaluation around 2 use cases. First, within-person risk detection aims to learn a person&#x2019;s baseline and detect departures from it, ultimately serving to support personalized JITAIs. Second, across-subject generalization asks whether models trained on one cohort can provide useful risk estimates for entirely new individuals, as found in screening approaches. The former prioritizes temporal transfer within a person; the latter emphasizes robustness across heterogeneous digital habits. We also compare granularity&#x2014;single-screenshot predictions versus brief, EMA-level summaries&#x2014;testing whether short-horizon temporal context improves performance.</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Sample</title><p>The sample consisted of 79 adults who reported past-month suicidal thoughts or behaviors, owned an Android-based smartphone, and demonstrated variability on the variables of interest during 28 days of data collection. Of the 79 enrolled participants, 70 were retained in the analytic sample, 3 were excluded due to no screenshots captured, and 6 due to complete nonresponse to EMA assessments. Participants were recruited via social media advertisements and community flyers from a midsized city in the Midwest. Participants ranged in age from 20 to 63 years (mean 35.15, SD 11.07); 68.4% (54/79) were female; 84.8% (67/79) identified as White, 6.3% (5/79) as Black, 6.3% (5/79) as American Indian/Alaska Native, and 2.5% (2/79) as another race; and 92.4% (73/79) identified as non-Hispanic/Latino. 
Most participants (83.8%; 66/79) met diagnostic criteria for at least one psychiatric disorder, and 64.9% (51/79) met current diagnostic criteria for 2 or more disorders (mean 2.55, SD 1.86). Further, 72.2% (57/79) had a lifetime history of a suicide plan (38% (30/79) in the past year), and 64.6% (51/79) had a lifetime history of a suicide attempt (12.7% (10/79) in the past year). Two participants self-presented for psychiatric hospitalization for suicidal crises during the study period; no participants reported engaging in a suicide attempt.</p><p>Screenshots were automatically captured every 5 seconds during active phone use via the ScreenLife Capture app (University of Washington) [<xref ref-type="bibr" rid="ref33">33</xref>], with modifications made by the study team. The study captured 7,501,670 screenshots in total, with participants averaging 92,613 screenshots each (median 85,795; IQR 20,469-130,888) across the study period. During the informed consent process, participants were fully informed of all data collection procedures (ie, screenshot capture every 5 seconds their phone was in use), including the possibility of capture of sensitive data throughout the study period, data storage procedures, and steps taken to help protect participant confidentiality (ie, limited viewing of raw screenshot files). Limits of confidentiality were also discussed, including circumstances during which the research staff was concerned for their immediate safety. All participants received resources at study enrollment. If nonzero active SI was reported via EMA, an automated pop-up of crisis resources was provided. 
If high levels of active SI (ie, &#x003E;4 out of 5) were reported, the study team reached out to conduct a comprehensive risk assessment.</p></sec><sec id="s2-2"><title>Ecological Momentary Assessment</title><p>Participants received 6 signal-contingent EMA prompts per day (ie, randomized within 2-hour windows across a 12-hour block) via the LifeData (LifeData LLC) app, each taking 3&#x2010;4 minutes to complete. The overall EMA compliance rate was 68.8%, with an average of 3.4 hours elapsed between EMA surveys. Two questions assessed momentary (ie, &#x201C;At this moment&#x2026;&#x201D;), active SI (ie, &#x201C;I think about taking my life;&#x201D; &#x201C;I want to die&#x201D;), which have been previously validated [<xref ref-type="bibr" rid="ref34">34</xref>]. These items were answered on a 5-point Likert scale (1=not at all to 5=very much) and summed to create a composite score (range: 2&#x2010;10). The continuous SI composite exhibited pronounced floor effects, with 63%&#x2010;69% of EMAs having scores of 2 (both items at minimum), and only 10%&#x2010;15% scoring 5 or above (indicating at least moderate endorsement on one item). We therefore dichotomized the outcome as 0=no SI (composite =2) and 1=SI present (composite &#x003E;2; positive class for AUPRC). In the temporal holdout split, SI was present in 36.5% of train EMAs (1086/2974) and 33.9% of test EMAs (381/1123). In the subject holdout split, SI was present in 40.0% of train EMAs (871/2177) and 31.0% of test EMAs (582/1880).</p></sec><sec id="s2-3"><title>Data Subsetting</title><p>For the purposes of this study, we removed screenshots that occurred more than 2 hours prior to the EMA assessment. The resultant dataset comprised 2,554,692 screenshot observations collected from 70 unique participants over the 28-day study period, with substantial variability in data contribution across individuals (mean 36,495.60 screenshots per participant, SD 26,999.18; range 7&#x2010;129,680). 
Compliance was 68.11% for active SI, and text extracted from screenshots was present in 99.94% of observations (2,553,240 rows), with only 1452 instances containing no detectable text. The extracted text varied considerably in length (mean 224.13 characters, SD=194.26; range=0&#x2010;2491), reflecting the diverse nature of smartphone screen content captured during naturalistic use, from brief notifications to lengthy articles or messages. All extracted text was retained without filtering, given the models&#x2019; ability to handle variable input lengths and the challenge of defining noise in naturalistic smartphone data.</p></sec><sec id="s2-4"><title>Performance Evaluation</title><p>Our intended applications are (1) within-person personalized risk detection and (2) across-subject generalization. These require different evaluation strategies [<xref ref-type="bibr" rid="ref30">30</xref>-<xref ref-type="bibr" rid="ref32">32</xref>]: temporal holdouts assess whether person-specific patterns persist over time (relevant for personalized monitoring), while subject holdouts assess generalization to new individuals (relevant for screening). Performance estimates can be inflated by 100%&#x2010;300% when the evaluation strategy is misaligned with the intended use case [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>]. We report only out-of-sample predictions under both strategies.</p></sec><sec id="s2-5"><title>Train/Test Partitioning</title><p>For temporal holdout validation, we trained on the first 70% of each participant&#x2019;s EMA assessments and tested on the final 30%, simulating prospective deployment and avoiding interpolation across randomly intermixed time points [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref37">37</xref>]. 
For subject holdout validation, we performed a 50/50 split by participant ID, keeping all observations from a given person in a single fold to test performance on entirely new individuals and to respect ergodicity concerns [<xref ref-type="bibr" rid="ref38">38</xref>,<xref ref-type="bibr" rid="ref39">39</xref>]. More elaborate schemes (eg, blocked CV with gaps, leave-one-subject-out, constrained repeated random sampling cross-validation, and forward-chaining) are valuable [<xref ref-type="bibr" rid="ref40">40</xref>-<xref ref-type="bibr" rid="ref43">43</xref>] but were computationally prohibitive with foundation models.</p></sec><sec id="s2-6"><title>Prediction Targets, Labeling, and Aggregation</title><p>To couple signals to clinically actionable horizons, we restricted inputs to screenshots captured within the 2-hour window prior to each EMA [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref44">44</xref>]. Screenshots in this window inherited the label from the immediately following EMA. That is, for an EMA at time &#x201C;<italic>t</italic>&#x201D;, all screenshots from (<italic>t</italic>&#x2212;2h, <italic>t</italic>) were labeled with that EMA&#x2019;s SI response. Screenshots occurring after an EMA were never used to predict that assessment, ensuring strictly prospective prediction (see <xref ref-type="fig" rid="figure2">Figure 2</xref> for an overview of the data design). We evaluated 2 granularities: (1) screenshot-level predictions and (2) EMA-level predictions that aggregate information across screenshots in a window. For EMA-level models, we computed summary features commonly used in short-horizon risk detection (eg, mean, max, SD, range, 75th-90th percentiles, proportion&#x2265;5 and &#x2265;7 risk, skewness, and kurtosis). This design tests whether brief temporal context improves discrimination relative to single-frame decisions while maintaining strict separation between training and test periods/participants. 
Of note, we tested models that aggregated the data prior to training (eg, concatenating the text; subsampling images); however, none of these models evidenced an above-average change in predictive performance, likely due to limitations imposed by token input constraints.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Data structure and prediction aggregation. Screenshots within the 2-hour window preceding each ecological momentary assessment (EMA) inherit that assessment&#x2019;s suicidal ideation label. For example, screenshots captured at 11:32, 12:04, and 12:49 are labeled with the 1:00 PM EMA response (SI&#x2081;:&#x2080;&#x2080;) and aggregated to predict that assessment. Screenshot-level predictions (p&#x0302;) are generated for each image; EMA-level predictions (&#x03BC;p&#x0302;) aggregate these via summary statistics. EMA: ecological momentary assessment.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mental_v13i1e90581_fig02.png"/></fig><p>These choices reflect a theoretical stance as much as a technical one: temporal holdouts split the data to test whether person-specific digital patterns persist over time and are therefore suitable for personalized JITAIs; subject holdouts ask whether learned indicators transfer across heterogeneous digital habits&#x2014;a harder, but necessary, condition for broader screening [<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref39">39</xref>]. 
By using deployment-aligned evaluation strategies, we aim to provide performance estimates that are credible for real-world deployment in digital medicine [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref32">32</xref>].</p></sec><sec id="s2-7"><title>Evaluation Metrics</title><p>We report area under the receiver operating characteristic curve (AUC) and area under the precision-recall curve (AUPRC) as threshold-independent measures of discrimination on held-out data; AUPRC is especially informative under class imbalance because it emphasizes performance on the positive class [<xref ref-type="bibr" rid="ref45">45</xref>-<xref ref-type="bibr" rid="ref47">47</xref>]. Because our goal was to assess ranking ability rather than commit to an operational alert threshold, we did not report single-threshold metrics (eg, precision, recall, <italic>F</italic><sub>1</sub>-score, and accuracy), which depend on a chosen decision rule and deployment prevalence [<xref ref-type="bibr" rid="ref46">46</xref>,<xref ref-type="bibr" rid="ref47">47</xref>].</p><p>To account for clustering of observations within participants, we computed 95% CIs using participant-level bootstrap resampling (1000 iterations). In each iteration, participants were sampled with replacement, and all observations from each sampled participant were included. AUC was computed on the resampled data, and percentile-based confidence intervals were derived from the bootstrap distribution. For model comparisons, we computed &#x0394;AUC on paired bootstrap samples to obtain confidence intervals for the difference in discrimination between models.</p><p>Because pooled AUC computed across all participants can conflate within-person predictive accuracy with between-person differences in baseline risk [<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref39">39</xref>], we conducted supplemental analyses to decompose these sources of discrimination. 
First, we computed person-level AUC for each participant who had both SI-present and SI-absent EMAs in the temporal holdout test set and reported the distribution of these values. Second, we assessed the contribution of between-person discrimination by replacing each observation&#x2019;s predicted probability with that participant&#x2019;s mean predicted probability across all test EMAs (effectively reducing predictions to a person-specific intercept) and computing AUC on these intercept-only predictions. The difference between the full and between-person-only AUC indexes the incremental contribution of within-person temporal variation in predictions.</p></sec><sec id="s2-8"><title>Text and Visual Feature Extraction</title><p>EasyOCR (Jaided AI [<xref ref-type="bibr" rid="ref48">48</xref>]), a deep learning-based optical character recognition system, was applied to each screenshot to extract text content. The Python (Python Software Foundation) implementation of EasyOCR uses a combination of text detection and recognition models to identify and transcribe text regions from images. Across all extracted text blocks, EasyOCR confidence scores were highly variable (median 0.66, IQR 0.27&#x2010;0.93), reflecting heterogeneity in extraction quality across screenshot types. During manual inspection, we observed that text blocks with lower OCR confidence scores often contained inserted special characters rather than complete misreadings, making dictionary matching possible regardless of confidence thresholds. Therefore, we chose not to exclude any text blocks based on EasyOCR confidence scores.</p><p>Additionally, we applied Florence-2 (base version) [<xref ref-type="bibr" rid="ref49">49</xref>], a VLM pretrained on diverse visual and textual tasks. 
For each screenshot, the system performed five parallel extraction tasks: (1) optical character recognition to extract all visible text content, (2) detailed visual captioning to describe screen elements and layout, (3) condensed content analysis, (4) application identification through direct model querying (&#x201C;Which mobile application is shown in this screenshot?&#x201D;), and (5) social engagement feature detection with interaction cues, call-to-action elements, and engagement features. Application identification combined both heuristic text matching against extracted OCR content (searching for app-specific keywords like &#x201C;Facebook,&#x201D; &#x201C;Instagram,&#x201D; and &#x201C;WhatsApp&#x201D;) and the model&#x2019;s direct predictions.</p></sec><sec id="s2-9"><title>Zero-Shot Learning Analysis</title><p>To evaluate prediction without person-specific calibration, we applied zero-shot models that used no training data from this sample, relying entirely on pretrained weights. Risk scores were extracted from Llama 3.2 11B Vision-Instruct (Meta Inc; see <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) using regular expression pattern matching to identify numeric values (0&#x2010;10) at the end of each response. Missing EMA scores (n=3543) were excluded from analysis. Extraction failures were rare (0.05% of observations), did not predict SI (AUC=0.50), and results were robust to imputation strategy. Additionally, to evaluate zero-shot transferability of existing mental health&#x2013;specific models, we applied Mental-FLAN-T5-XXL [<xref ref-type="bibr" rid="ref50">50</xref>], an 11-billion parameter model instruction-finetuned on multiple Reddit (Reddit Inc)-based mental health datasets, including suicide ideation classification (SDCNL), depression severity (DepSeverity), and stress detection (Dreaddit). 
This model was selected because it previously outperformed GPT-3.5 and GPT-4 on in-domain mental health classification tasks, representing a strong prior for text-based risk assessment. We used a classification-based prompt asking the model to categorize risk as &#x201C;none, low, moderate, high, or severe&#x201D; based on the OCR-extracted text; categorical outputs were mapped to ordinal probability scores (0.0&#x2010;1.0) for analysis.</p></sec><sec id="s2-10"><title>Fine-Tuning</title><p>All models were trained to predict a binary indicator of active SI (composite EMA score &#x003E;2) using binary cross-entropy loss with logits. No class weighting or resampling was applied; instead, performance was evaluated using threshold-independent metrics (AUC and AUPRC), which are robust to class imbalance. All models were implemented in PyTorch (Meta Inc) using HuggingFace Transformers (Hugging Face) and trained on NVIDIA L40S GPUs (48GB VRAM; NVIDIA Corp) using distributed data parallelism where applicable. Complete prompts are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-11"><title>Qwen2.5-VL (Vision-Language Model)</title><p>Qwen2.5-VL adds native dynamic-resolution processing and absolute time encoding for long-video understanding while preserving strong text generation. A redesigned ViT with windowed attention retains native image resolution at lower compute. The series is released in 3B, 7B, and 72B variants; the 72B model performs on par with leading multimodal systems on document/diagram tasks [<xref ref-type="bibr" rid="ref51">51</xref>]. We used the 3B and 7B variants.</p><p>The model was fine-tuned using the Qwen2.5-VL-3B-Instruct base model with Low-Rank Adaptation (LoRA) [<xref ref-type="bibr" rid="ref52">52</xref>] to efficiently adapt the VLM for suicide-risk prediction. 
The LoRA configuration used a rank (r) of 16 with a scaling factor (&#x03B1;) of 32, targeting the query, key, value, and output projection layers (q_proj, v_proj, k_proj, o_proj) of the attention mechanism. Training was conducted for 3 epochs with a batch size of 2, using the AdamW optimizer with a learning rate of 1e-5 and weight decay of 0.01. Gradient clipping was applied with a maximum norm of 0.5 to ensure stable training. Input images were resized to 224&#x00D7;336 pixels.</p></sec><sec id="s2-12"><title>Qwen3 (Large Language Model Series)</title><p>Qwen3 comprises dense and mixture-of-experts large language models from 0.6B to 235B parameters. A unified design supports both a reasoning (&#x201C;thinking&#x201D;) mode and a fast conversational (&#x201C;non-thinking&#x201D;) mode, with a user-controllable &#x201C;thinking budget&#x201D; to trade off latency and accuracy. Qwen3 reports state-of-the-art results across coding, math, and agent tasks; expanded coverage of 119 languages; and Apache-2.0 licensing [<xref ref-type="bibr" rid="ref53">53</xref>]. We used the 1.7B and 4B variants. For zero-shot learning only, we applied the 30B-A3B variant (using a different prompt; see <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><p>The text-based suicide risk prediction model used Qwen3-1.7B as the base language model, fine-tuned with LoRA targeting a comprehensive set of attention and feed-forward network modules (q_proj, v_proj, k_proj, o_proj, gate_proj, up_proj, down_proj). The LoRA configuration used a rank of 16 with an alpha scaling factor of 32 and dropout rate of 0.1. Training was conducted for up to 10 epochs with an effective batch size of 8 (batch size of 2 with gradient accumulation over 4 steps), using the AdamW optimizer with a learning rate of 2e-6, weight decay of 0.01, and gradient clipping at 0.5. 
The model used mixed precision training with bfloat16 and implemented a linear learning rate schedule with 10% warmup steps. Text inputs were tokenized with a maximum sequence length of 2048 tokens, with early stopping based on EMA-level AUC performance (patience of 5 epochs, minimum delta of 0.001).</p></sec><sec id="s2-13"><title>Liquid AI Foundation Models (LFM2)</title><p>LFM2 are compact, on-device large language models using a hybrid architecture (10 multiplicative-gated convolutional blocks plus 6 grouped-query attention blocks discovered via STAR NAS), trained on ~10T tokens with knowledge distillation. Models are released at 350M, 700M, and 1.2B parameters. On automated benchmarks, the 350M model is competitive with Qwen3-0.6B and Llama-3.2-1B, while the 1.2B model approaches Qwen3-1.7B [<xref ref-type="bibr" rid="ref54">54</xref>].</p></sec><sec id="s2-14"><title>Liquid AI LFM2-VL (Vision-Language Models)</title><p>LFM2-VL extends LFM2 with a SigLIP-2 NaFlex vision encoder and a lightweight projector, offering 2 open-weight variants, 450M and 1.6B parameters. Models process images at native resolution up to 512&#x00D7;512 with patching and an optional thumbnail for global context and report up to 2&#x00D7; faster GPU inference than similarly sized VLMs. Reported results include RealWorldQA &#x2248;52.3% for the 450M model and &#x2248;65.2% for the 1.6B model, competitive within the ~0.5&#x2013;2B class [<xref ref-type="bibr" rid="ref55">55</xref>,<xref ref-type="bibr" rid="ref56">56</xref>].</p><p>The image-based suicide risk prediction model was fine-tuned using the LFM2-VL-450M vision-language foundation model as the base architecture. Fine-tuning was performed using the Transformer Reinforcement Learning Supervised Fine-Tuning framework with LoRA applied to enhance parameter efficiency. 
The LoRA configuration used a rank (r) of 16 and scaling factor (alpha) of 32, targeting attention and feed-forward modules, including q_proj, k_proj, v_proj, o_proj, fc1, fc2, gate_proj, up_proj, and down_proj. Training was conducted for 3 epochs with a batch size of 4 and gradient accumulation over 4 steps (effective batch=16), using the AdamW optimizer with a learning rate of 2&#x00D7;10&#x207B;&#x2075;, weight decay of 0.01, and a cosine learning-rate scheduler with 10% warm-up. Training used bfloat16 mixed precision with TF32 enabled and applied gradient clipping (max norm=0.5) to ensure numerical stability. Input images were dynamically resized to 224&#x00D7;336 pixels, and training/validation splits followed a 50/50 and 70/30 data division. Model evaluation was based on accuracy and AUC, computed on a held-out validation subset. All fine-tuning and evaluation were executed on a single-GPU SLURM node using efficient on-the-fly image loading.</p><p>The text-only suicide risk prediction model was trained on a large-scale dataset using the LFM-2 Text framework. The configuration used <italic>r</italic>=64, alpha=128, and dropout=0.05, targeting key attention and MLP projection modules (q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj, down_proj). Training was performed for 1 epoch on the full dataset without intermediate evaluations to maximize throughput, using the AdamW optimizer with a learning rate of 2&#x00D7;10&#x207B;&#x2076;, weight decay of 0.01, and gradient accumulation steps of 4 (effective batch=8). Mixed precision training with bfloat16 and TF32 acceleration was used throughout. The model incorporated gradient checkpointing for memory efficiency and used a cosine learning-rate schedule with a warm-up ratio of 0.1. Text inputs were tokenized with a maximum sequence length of 2048 tokens, and final model evaluation was conducted posttraining using a separate validation pipeline. 
Training used bits-and-bytes quantization (4-bit or 8-bit configurable) for efficient large-scale optimization.</p></sec><sec id="s2-15"><title>Secondary Analyses</title><sec id="s2-15-1"><title>Temporal and Behavioral Confounding</title><p>To assess the extent to which prediction could be driven by temporal or behavioral regularities rather than semantic content, we trained gradient-boosted decision tree baselines using XGBoost (XGBClassifier) [<xref ref-type="bibr" rid="ref57">57</xref>]. These models served as nonsemantic baselines for (1) SI prediction using temporal/behavioral features and (2) EMA missingness prediction. All XGBoost models used 100 trees, a maximum depth of 5, a learning rate of 0.1, histogram-based tree construction (tree_method=hist), and log loss as the objective function, with no class weighting applied. Features for the confounding baseline included cyclical encodings of hour-of-day and day-of-week (sine and cosine), a weekend indicator, and the number of screenshots within each EMA window. The missingness baseline additionally included coarse time-of-day categories and study day index. Models were trained on the full training split in a single fit (no epochs or early stopping) with a fixed random seed (42), using GPU acceleration when available and CPU otherwise. These baselines provide a lower-bound comparison for semantic models by capturing timing and usage intensity without access to screen content.</p></sec><sec id="s2-15-2"><title>Lexical Screening for Between-Person Prediction</title><p>Initial experiments established baselines using random selection and evenly spaced temporal sampling of 30 screenshots per 120-minute window preceding each assessment (AUC=0.599). 
We then implemented dictionary-based selection using a 276-term crisis dictionary [<xref ref-type="bibr" rid="ref16">16</xref>] across 7 categories (suicidal thoughts, nonsubstance methods, substance use, sleep, help-seeking, hopelessness, and general risk), prioritizing screenshots with the highest crisis term counts (AUC=0.615, AUPRC=0.452 vs 0.310 baseline). Enhanced approaches incorporating temporal weighting (recent screenshots weighted 1.0 declining to 0.5), context windows (100 characters around crisis terms), and similarity-based diversity enforcement (70% threshold) reduced performance (AUC=0.597). Finally, we developed a data-driven approach that learned predictive terms directly from screenshot-level TF-IDF features using logistic regression on individual screenshots labeled by their associated SI scores, then used the discovered terms&#x2019; coefficients to score and select screenshots for final EMA-level prediction, though all methods showed substantial overfitting (train-test AUC gaps of 0.33&#x2010;0.41), suggesting the need for stronger regularization or alternative architectures.</p></sec><sec id="s2-15-3"><title>EMA Missingness Sensitivity Analyses</title><p>Because analyses relied on screenshots linked to completed EMAs (~69% compliance), we conducted sensitivity analyses to assess whether nonresponse could systematically bias performance estimates. First, to evaluate whether missingness is predictable from the same features used in SI prediction, we trained models to predict EMA completion rather than SI. An XGBoost baseline used temporal and behavioral features (cyclical encodings of hour-of-day and day-of-week, weekend indicator, screenshot count per EMA window, coarse time-of-day categories, and study day index). We then trained a separate VLM (Qwen2.5-VL) to predict EMA completion from screenshot content, using the same fine-tuning procedure described above, to assess whether screen content additionally predicted missingness. 
Second, to directly test whether the SI prediction model produced systematically different risk estimates for completed versus missed EMAs, we applied the trained SI temporal holdout model (Qwen2.5-VL) to all screenshots in the test set, regardless of whether the corresponding EMA was completed. Predictions were aggregated to the EMA level (mean) and compared between completed and missed EMAs using a mixed effects model (predicted risk~completion status, random intercept for participant) and within-person effect sizes (person-centered Cohen <italic>d</italic>). Phone use intensity (screenshot count per 2-hour window) was compared using the same specification.</p></sec></sec><sec id="s2-16"><title>Interpretation</title><p>We first identified the top 1000 model-predicted probability screenshots from the Qwen-VL 70/30 model and passed each image through a Qwen-VL analysis script that applied a fixed, suicide-relevant prompt to produce structured natural-language descriptions of the on-screen content (platform/app, interaction type, safety dialogs, crisis/help-seeking language, and contextual cues; see <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>); these descriptions comprised the text corpus for interpretation. After text cleaning (lowercasing, boilerplate removal, and retention of platform and suicide-relevant terms), we vectorized the corpus and ran a sweep of topic models, Latent Dirichlet Allocation (LDA) [<xref ref-type="bibr" rid="ref58">58</xref>] on count features and nonnegative matrix factorization (NMF) on TF-IDF features, across candidate solutions (k &#x2248; 5&#x2010;15) to recover a compact, coherent set of suicide-relevant digital contexts. 
Each topic from the LDA and NMF runs was then graded into one of 4 salience bands&#x2014;high (explicit crisis/help/safety), medium (operationally adjacent screens, such as platform safety systems or multi-app assessments), support (coping/help content), and subtle (ambient/contextual screens with weaker signal)&#x2014;using the top terms and exemplar descriptions. Finally, we rendered smartphone-style synthetic mockups for each topic using DALL-E 3, instantiating that topic&#x2019;s key user interface elements (eg, an Instagram [Meta Corp] post with a self-harm warning, a crisis chat thread, or an assessment-like mobile card) so VLM-derived topics could be visually inspected for interpretability. These mockups are illustrative visualizations of content categories and do not depict actual participant screenshots; topic labels should be interpreted as general thematic clusters rather than literal content descriptions.</p></sec><sec id="s2-17"><title>Ethical Considerations</title><p>This study was approved by the institutional review board at the University of Notre Dame (#21-12-6965) and the University of Wisconsin&#x2013;Madison (#2024-1031). All procedures were conducted in accordance with the Declaration of Helsinki. During the informed consent process, participants were fully informed of all data collection procedures, including screenshot capture every 5 seconds during active phone use, the possibility of capturing sensitive data, data storage procedures, and steps taken to protect participant confidentiality (ie, limited viewing of raw screenshot files). Limits of confidentiality were also discussed, including circumstances in which the research team was concerned for a participant's immediate safety. 
All participants received crisis resources at enrollment; if nonzero active SI was reported via ecological momentary assessment, an automated pop-up of crisis resources was provided, and if high levels of active SI (ie, &#x2265;4 out of 5) were reported, the study team conducted a comprehensive risk assessment. To protect participant privacy, raw screenshot data are not publicly available, and all model training and evaluation were conducted within a Health Insurance Portability and Accountability Act (HIPAA)&#x2013;secure computing environment. Synthetic images used for interpretation were generated with no real participant data, and all were labeled &#x201C;SYNTHETIC&#x2013;RESEARCH ONLY.&#x201D; Participants were compensated up to $230 for their involvement: $40 for the baseline session, $100 for ecological momentary assessment completion, a $35 incentive bonus for completing at least 75% of ecological momentary assessments, and $55 for the screenshot capture period.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Overview</title><p>The AUC and AUPRC values across models and data types are summarized in <xref ref-type="table" rid="table1">Table 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Model performance across evaluation conditions. Area under the receiver operating characteristic curve (AUC) and area under the precision-recall curve (AUPRC) for vision-language (images) and text-only (text) models under temporal holdout (70/30) and subject holdout (50/50) evaluation. Positive class (suicidal ideation [SI] present) prevalence: 33.9% in temporal holdout test and 31.0% in the subject holdout test; baseline AUPRC equals prevalence. 
95% CIs in brackets computed via participant-level bootstrap (1000 iterations).</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Model</td><td align="left" valign="top">Level</td><td align="left" valign="top" colspan="2">Temporal holdout</td><td align="left" valign="top" colspan="2">Subject holdout</td></tr></thead><tbody><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">AUC<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> (95% CI)</td><td align="left" valign="top">AUPRC<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup> (95% CI)</td><td align="left" valign="top">AUC (95% CI)</td><td align="left" valign="top">AUPRC (95% CI)</td></tr><tr><td align="left" valign="top" colspan="6">Images</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Qwen</td><td align="left" valign="top">Screenshot</td><td align="left" valign="top">0.747 (0.671-0.813)</td><td align="left" valign="top">0.659 (0.506-0.773)</td><td align="left" valign="top">0.517 (0.500-0.584)</td><td align="left" valign="top">0.382 (0.249-0.499)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Qwen</td><td align="left" valign="top">EMA<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td><td align="left" valign="top">0.830 (0.743-0.904)</td><td align="left" valign="top">0.767 (0.580-0.892)</td><td align="left" valign="top">0.498 (0.504-0.680)</td><td align="left" valign="top">0.355 (0.202-0.435)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LFM2<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup></td><td align="left" valign="top">Screenshot</td><td align="left" valign="top">0.778 (0.692-0.842)</td><td align="left" valign="top">0.695 
(0.509-0.800)</td><td align="left" valign="top">0.530 (0.491-0.568)</td><td align="left" valign="top">0.310 (0.225-0.471)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LFM2</td><td align="left" valign="top">EMA</td><td align="left" valign="top">0.806 (0.690-0.881)</td><td align="left" valign="top">0.752 (0.589-0.860)</td><td align="left" valign="top">0.511 (0.490-0.620)</td><td align="left" valign="top">0.312 (0.202-0.429)</td></tr><tr><td align="left" valign="top" colspan="2">Text</td><td align="left" valign="top" colspan="4"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Qwen</td><td align="left" valign="top">Screenshot</td><td align="left" valign="top">0.683 (0.616-0.739)</td><td align="left" valign="top">0.586 (0.418-0.745)</td><td align="left" valign="top">0.524<sup><xref ref-type="table-fn" rid="table1fn5">e</xref></sup> (0.500-0.605)</td><td align="left" valign="top">0.283<sup><xref ref-type="table-fn" rid="table1fn5">e</xref></sup> (0.248-0.595)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Qwen</td><td align="left" valign="top">EMA</td><td align="left" valign="top">0.793 (0.791-0.859)</td><td align="left" valign="top">0.718 (0.507-0.816)</td><td align="left" valign="top">0.563<sup><xref ref-type="table-fn" rid="table1fn5">e</xref></sup> (0.488-0.697)</td><td align="left" valign="top">0.312<sup><xref ref-type="table-fn" rid="table1fn5">e</xref></sup> (0.234-0.582)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LFM2</td><td align="left" valign="top">Screenshot</td><td align="left" valign="top">0.488 (0.471-0.505)</td><td align="left" valign="top">0.315 (0.218-0.416)</td><td align="left" valign="top">0.521 (0.497-0.589)</td><td align="left" 
valign="top">0.390 (0.252-0.597)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LFM2</td><td align="left" valign="top">EMA</td><td align="left" valign="top">0.580 (0.606-0.652)</td><td align="left" valign="top">0.306 (0.202-0.435)</td><td align="left" valign="top">0.550 (0.488-0.697)</td><td align="left" valign="top">0.380 (0.235-0.583)</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>AUC: area under the receiver operating characteristic curve.</p></fn><fn id="table1fn2"><p><sup>b</sup>AUPRC: area under the precision-recall curve.</p></fn><fn id="table1fn3"><p><sup>c</sup>EMA: ecological momentary assessments.</p></fn><fn id="table1fn4"><p><sup>d</sup>LFM2: Liquid AI Foundation Model.</p></fn><fn id="table1fn5"><p><sup>e</sup>Models using high-level Florence-2 features rather than raw OCR text.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Temporal Holdout Evaluation</title><sec id="s3-2-1"><title>Image</title><p>Using Qwen2.5-VL, at the screenshot level, the model evidenced an AUC of 0.747 and an AUPRC of 0.659. At the EMA level, there were similar results across several summary statistics, with the mean performing among the best, with an AUC of 0.830 and AUPRC of 0.767, substantially exceeding the baseline of 0.339 (positive class prevalence). Not much was lost with the application of the LFM2-VL model, with an AUC of 0.778 and AUPRC of 0.695 at the screenshot level and an AUC of 0.806 and AUPRC of 0.752 using the mean at the EMA level.</p></sec><sec id="s3-2-2"><title>Text</title><p>Using Qwen3, performance was somewhat attenuated relative to the image-based models but still showed meaningful discrimination. At the screenshot level, the model achieved an AUC of 0.683 and an AUPRC of 0.586. 
Aggregating to the EMA level improved performance, mirroring the pattern seen in the image analyses: the EMA-level summary reached an AUC of 0.793 and an AUPRC of 0.718. Somewhat surprisingly, LFM2 performed markedly worse, with screenshot-level results around chance, while the aggregated metrics were still far below the VL results.</p></sec><sec id="s3-2-3"><title>Comparison</title><p>Image-based models showed a small advantage over text-based models. At the EMA level, the Qwen image model (AUC=0.83, 95% CI 0.74-0.90) outperformed the Qwen text model (AUC=0.79, 95% CI 0.70-0.86), with a difference of &#x0394;AUC=0.04 (95% CI 0.003-0.07). Comparing vision-language architectures, LFM2 performed comparably to Qwen (AUC=0.80, 95% CI 0.71-0.89; &#x0394;AUC=0.02), supporting feasible on-device deployment without substantial performance loss. All confidence intervals were computed using participant-level bootstrap (1000 iterations) to account for clustering within individuals.</p></sec><sec id="s3-2-4"><title>Calibration</title><p>For the best-performing model (Qwen image, temporal holdout), calibration was assessed at the EMA level (see <xref ref-type="fig" rid="figure3">Figure 3</xref>). The calibration slope was 4.89 (ideal=1), indicating substantial underconfidence. Predicted probabilities were compressed toward the center relative to observed outcomes. The calibration intercept was &#x2212;2.24 (ideal=0), and the Brier score was 0.147. At the screenshot level, the calibration slope was closer to unity (1.90), but overall accuracy was lower (Brier=0.258), reflecting noisier frame-level predictions that EMA-level aggregation sharpens at the cost of increased probability compression. We evaluated whether post-hoc Platt scaling could correct this miscalibration by fitting a logistic recalibration model on a held-out calibration subset (n=523 EMAs) and applying it to an evaluation subset (n=600 EMAs). 
Platt scaling did not substantially improve calibration (postrecalibration slope=4.74, Brier=0.146), suggesting that the probability compression induced by aggregating screenshot-level predictions to EMA-level summaries requires more sophisticated calibration approaches than simple monotonic transformations.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Calibration curves for the best-performing model (Qwen2.5-VL, temporal holdout) at ecological momentary assessment (EMA) level (left) and screenshot level (right). Each point represents one decile bin of predicted probabilities; the dashed diagonal indicates perfect calibration. At the EMA level, predictions are monotonically related to observed suicidal ideation rates but exhibit systematic underconfidence (calibration slope=4.89), with low-risk predictions slightly overestimating and high-risk predictions substantially underestimating observed event rates. At the screenshot level, predicted probabilities are heavily compressed: approximately 80% of observations fall below 0.15, with a small proportion near 1.0, illustrating the probability compression that EMA-level aggregation partially corrects. EMA: ecological momentary assessment.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mental_v13i1e90581_fig03.png"/></fig></sec></sec><sec id="s3-3"><title>Subject Holdout Evaluation</title><sec id="s3-3-1"><title>Image</title><p>Compared to the temporal holdout models, the subject holdout models showed clear attenuation in performance. Using Qwen2.5-VL on image features at the screenshot level, the model achieved an AUC of 0.517 and an AUPRC of 0.382. Aggregating to the EMA level did not improve performance (AUC=0.498; AUPRC=0.355), suggesting that in the between-person setting, there was limited additional discriminative signal to recover from averaging across a person&#x2019;s screenshots. 
AUPRC values (0.31&#x2010;0.39) were near the baseline prevalence of 0.31, indicating minimal discrimination beyond chance. For the LFM2-VL image models, performance can be considered at chance.</p></sec><sec id="s3-3-2"><title>Text</title><p>A similar pattern was observed for text-only features. At the screenshot level, the Qwen text model produced an AUC of 0.524 and an AUPRC of 0.283, indicating only marginal separation. EMA-level aggregation improved performance slightly (AUC=0.563; AUPRC=0.312), but performance still lagged the temporal holdout evaluations. Of note, this performance was based on application to the high-level text features, not raw text (AUC=0.467; AUPRC=0.354). As with the image models, the LFM2 text models in this between-person context performed similarly, but with higher AUPRC values (0.390 and 0.380) and were trained on the raw text.</p></sec></sec><sec id="s3-4"><title>Zero-Shot Learning</title><p>At the image level, LLaMA risk scores demonstrated minimal predictive ability (AUC=0.511; AUPRC=0.410). At the EMA level, the mean risk score across images within an EMA showed the strongest predictive ability (AUC=0.578; AUPRC=0.436), followed by SD (AUC=0.568; AUPRC=0.427). Zero-shot models were evaluated without any training or fine-tuning on data from this study, relying entirely on pretrained weights. Their weak performance likely reflects the absence of individual-specific calibration, which limits the model&#x2019;s ability to learn person-specific baselines and idiosyncratic digital behavior patterns.</p><p>For the 50/50 test set only, we applied Qwen3-30B-A3B to both the text and images. For the text, at the screenshot level, the AUC was 0.541 and the AUPRC was 0.343, and at the EMA level (mean), the AUC was 0.583 and the AUPRC was 0.385. 
In images, the screenshot AUC was 0.518 with an AUPRC of 0.329, and at the EMA level (mean), the AUC was 0.552 and the AUPRC was 0.398.</p><p>Performance of Mental-FLAN-T5&#x2014;despite its strong prior results on social media text&#x2014;was limited on screenshot OCR; at the screenshot level, discrimination was near chance (AUC=0.504; AUPRC=0.322). Aggregating to the EMA level using mean scores yielded modest improvement (AUC=0.558; AUPRC=0.360), suggesting that even specialized mental health models do not transfer effectively to passively collected screenshot content.</p></sec><sec id="s3-5"><title>Sensitivity and Secondary Analyses</title><sec id="s3-5-1"><title>Within-Person Versus Between-Person Discrimination</title><p>To assess the extent to which pooled performance reflects within-person temporal discrimination versus between-person differences in baseline risk, we computed person-level AUCs for participants with outcome variability in the temporal holdout test set (reported here for the best-performing model, Qwen2.5-VL with EMA-level mean aggregation). Each participant contributed a median of 13 test EMAs (IQR 7&#x2010;26; range 1&#x2010;48). Of 64 participants, 19 had all-zero test outcomes and 6 had all-positive, reflecting genuine individual differences in SI base rates given the reasonably sized per-person test sets, leaving 39 out of 64 (61%) with computable person-level AUCs. The median person-level AUC was 0.70 (mean 0.70, SD 0.13; IQR 0.60&#x2010;0.80), indicating that the model discriminated higher- from lower-risk time points within individuals above chance. A between-person-only analysis that replaced each observation&#x2019;s prediction with the participant&#x2019;s mean predicted probability yielded an AUC of 0.86, indicating that a substantial portion of the pooled AUC (0.83) reflects the model&#x2019;s calibration to person-specific base rates. 
This decomposition is consistent with the intended deployment scenario: a calibration period allows the model to learn individual baselines, while temporal variation in predictions provides additional within-person discrimination for ongoing monitoring.</p></sec><sec id="s3-5-2"><title>Temporal and Behavioral Confounding</title><p>To assess whether prediction was driven by temporal or behavioral regularities rather than semantic content, we compared fine-tuned VLMs against a non-semantic baseline using only temporal and usage features (cyclical encodings of hour-of-day and day-of-week, weekend indicator, and screenshot count per EMA window). The temporal baseline achieved near-chance discrimination (AUC=0.52; AUPRC=0.39 for temporal holdout; AUC=0.51; AUPRC=0.37 for subject holdout), with AUPRC values approximating class prevalence as expected for uninformative classifiers. In contrast, models processing screenshot content achieved substantially higher discrimination in temporal holdout: &#x0394;AUC =+0.31; &#x0394;AUPRC =+0.37 for Qwen image (EMA-level); &#x0394;AUC =+0.28; &#x0394;AUPRC =+0.36 for LFM2 image; and &#x0394;AUC =+0.27; &#x0394;AUPRC =+0.33 for Qwen text. The minimal improvements observed in subject holdout (&#x0394;AUC&#x2264;0.05; &#x0394;AUPRC near zero or negative) reflect the previously noted generalization challenge rather than confounding, as the temporal baseline performed equivalently poorly across both evaluation strategies.</p></sec><sec id="s3-5-3"><title>Lexical Screening for Between-Person Prediction</title><p>Given the weak performance of the between-person models, formal comparisons across model architectures were not pursued; instead, we conducted exploratory follow-up analyses using substantially simplified approaches (see <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). 
Specifically, we applied a dictionary-based screening procedure that used a 276-term crisis lexicon [<xref ref-type="bibr" rid="ref16">16</xref>] spanning 7 categories (suicidal thoughts, nonsubstance methods, substance use, sleep, help-seeking, hopelessness, and general risk) to identify screenshots with the highest concentration of crisis-relevant terms. We then selected the 30 screenshots with the largest term counts and fit regularized logistic regression models to tf&#x2013;idf representations of the extracted text. Performance was modest, yielding an AUC of 0.615 and an AUPRC of 0.452.</p></sec><sec id="s3-5-4"><title>EMA Missingness</title><p>To assess whether missingness could bias performance estimates, we examined predictions from 2 angles. First, models trained to predict EMA completion (rather than SI) from the same temporal and behavioral features achieved modest discrimination (AUC=0.60); adding VLM-derived content features yielded only marginal improvement (AUC=0.66), indicating that missingness is only weakly predictable from the features used in SI prediction. Second, we applied the trained temporal holdout model (Qwen2.5-VL) to all screenshots in the test set, regardless of whether the corresponding EMA was completed. In the test set (completion rate=61.7%; 1111 of 1802 EMAs), predicted SI risk did not differ systematically between completed and missed EMAs (within-person Cohen <italic>d</italic>=0.03; mixed effects <italic>&#x03B2;</italic>=.005; <italic>t</italic>=0.60). Phone use intensity was modestly higher before completed EMAs (mean 444 vs 411 screenshots per 2-hour window; <italic>&#x03B2;</italic>=55.8; <italic>t</italic>=3.19), consistent with greater phone engagement preceding EMA response. 
However, conditional on having screenshots in the window, model-predicted risk was comparable regardless of completion status.</p></sec><sec id="s3-5-5"><title>Follow-Up Interpretation</title><p>Applying this pipeline to the 1000 Qwen-VL&#x2013;described screenshots produced a 9-topic solution in the LDA: (1) Instagram safety features, (2) platform safety systems (both graded Medium because they are crisis-adjacent but often triggered by platform logic), (3) crisis conversations, (4) social media crisis posts (graded high due to explicit crisis/help language), (5) multiapp risk assessment, (6) support-seeking messages (typically medium-support depending on the amount of explicit distress), (7) wellness/coping content (support), (8) visual risk markers, and (9) casual messaging indicators, which were retained as subtle topics capturing weaker but recurring contextual cues. In parallel, the NMF solution recovered highly similar structures but emphasized behavioral/interactional views of the same screens (eg, direct messaging, assessment interfaces, visual content with warnings, and emotional crisis chat), supporting the conclusion that 2 different topic-modeling families converged on the same underlying set of suicide-relevant screenshot patterns. The synthetic mockups (see <xref ref-type="fig" rid="figure4">Figure 4</xref>) mirrored these graded topics, a social post with a warning overlay for medium safety screens, back-and-forth messaging for high crisis chats, and a carded survey view for assessment topics, confirming that the latent topics mapped to concrete mobile UIs rather than to modeling artifacts.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Synthetic smartphone mockups illustrating each topic. Images were generated using DALL-E 3 to visualize the user interface patterns identified by topic modeling and do not depict actual participant data. 
Topic labels represent general thematic clusters identified through automated analysis.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mental_v13i1e90581_fig04.png"/></fig></sec></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>The present study demonstrates that VLMs applied to smartphone screenshots predict short-term SI with clinically meaningful accuracy. This extends digital phenotyping beyond behavioral metadata [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref59">59</xref>]&#x2014;GPS coordinates, accelerometer traces, and app usage logs&#x2014;to the semantic content of digital experience itself; what people read, write, and view on their phones. Prior work applying natural language processing to suicide risk has largely relied on social media posts labeled as high-risk based on content or forum membership (eg, r/SuicideWatch; [<xref ref-type="bibr" rid="ref60">60</xref>-<xref ref-type="bibr" rid="ref62">62</xref>]), conflating the signal used for prediction with the outcome being predicted [<xref ref-type="bibr" rid="ref63">63</xref>,<xref ref-type="bibr" rid="ref64">64</xref>]. By contrast, the present approach predicts prospectively reported SI via EMA, providing a cleaner separation between predictor and criterion. Models trained on each person&#x2019;s own history and evaluated with strict temporal holdouts reached EMA-level AUCs up to ~0.83, indicating that subtle, moment-to-moment variation in on-screen content carries an extractable signal about proximal risk.</p><p>A consistent finding was a performance gap between temporal and subject holdout prediction. 
Personalized models calibrated to an individual&#x2019;s baseline were markedly more accurate than models applied to new individuals, aligning with the nonergodic character of many psychological processes [<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref65">65</xref>] and with consistent empirical findings in digital phenotyping [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref66">66</xref>]. This dissociation between model performances suggests a 2-stage clinical architecture. While complex VLMs failed to generalize across individuals (AUC&#x2248;0.50), simple lexical features yielded modest between-person discrimination (AUC=0.615). This implies that universal screening may rely on coarse, &#x201C;nomothetic&#x201D; signals (eg, specific crisis keywords), while precision monitoring requires high-capacity, &#x201C;idiographic&#x201D; models (ie, VLMs; [<xref ref-type="bibr" rid="ref67">67</xref>]) that learn the subtle, pixel-level context of a specific patient&#x2019;s digital life.</p><p>The stark contrast between temporal holdout (AUC&#x2248;0.83) and subject holdout (AUC&#x2248;0.50) prediction challenges the prevailing &#x201C;universal biomarker&#x201D; assumption in digital psychiatry [<xref ref-type="bibr" rid="ref39">39</xref>]. Our results suggest that digital indicators of suicide risk are highly idiosyncratic; a specific app or interaction style that indicates risk for one patient may be benign for another. Consequently, clinical deployment should not rely on static, universal risk calculators, but rather on JITAIs that use a &#x201C;warm-start&#x201D; calibration period [<xref ref-type="bibr" rid="ref29">29</xref>]. This allows the model to learn patient-specific baselines before triggering active interventions. 
Given the low base rate of suicidal thoughts and behaviors, effective systems will likely combine idiographic and nomothetic cues, particularly for rarer events such as suicidal planning or attempts, where purely person-specific signals may be sparse.</p><p>Comparing modeling approaches, aggregating predictions across the pre-EMA window outperformed single-screenshot decisions, underscoring that temporal patterns carry more signal than isolated frames. Image-based models significantly outperformed text-based models, suggesting that visual context, layout, app interface, and media content capture information beyond what OCR-extracted text alone provides. Among vision-language architectures, the smaller LFM2 matched Qwen at the screenshot level but showed attenuated EMA-level performance, indicating a potential tradeoff where smaller models capture frame-level signal but lose coherence when predictions are aggregated.</p><p>The near-chance performance of Mental-FLAN-T5 highlights the domain specificity of transfer learning in digital phenotyping. Mental-FLAN-T5 was instruction-finetuned on Reddit posts where users explicitly narrate psychological distress in coherent, first-person text. Screenshot OCR, by contrast, yields fragmented streams of UI elements, notifications, and app content that rarely contain explicit mental health language. This suggests that &#x201C;text&#x201D; is not a monolithic modality; models trained on one text source may not generalize to passive text streams without task-specific adaptation. The substantial improvement observed with fine-tuned models in the present study reinforces this conclusion and argues against relying on off-the-shelf mental health NLP tools for novel passive sensing applications. This highlights a fundamental &#x201C;domain gap&#x201D; in computational psychiatry. 
Models trained on performative social media posts (eg, r/SuicideWatch) fail to recognize the fragmented, multimodal reality of localized distress (eg, viewing platform safety pop-ups, navigating wellness apps, or ambiguous casual messaging).</p><p>To probe what drives model predictions, we applied topic modeling to VLM-generated screenshot descriptions. The resulting clusters aligned with clinically meaningful content, crisis resources, self-harm search behavior, late-night messaging, and platform safety interventions [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref68">68</xref>,<xref ref-type="bibr" rid="ref69">69</xref>]. This suggests the models are not exploiting spurious artifacts but responding to content plausibly linked to acute distress. While not a formal explanation method, this analysis provides preliminary evidence that predictions are interpretable in terms recognizable to clinicians.</p></sec><sec id="s4-2"><title>Strengths and Innovation</title><p>To our knowledge, this is the first application of VLMs to passively collected smartphone screenshots for momentary suicide risk prediction. The dataset comprises ~7.5 million screenshots (~2.5 million within 2 hours of EMAs) from 70 high-risk participants, substantially larger than typical intensive longitudinal samples. By learning directly from pixel and text content rather than hand-engineered proxies, the approach captures information inaccessible to conventional sensor streams. Our evaluation used deployment-consistent temporal splits, and we found that window-level aggregation outperformed single-screenshot decisions, suggesting that patterns over time, not isolated screens, carry most signal.</p></sec><sec id="s4-3"><title>Key Limitations and Fairness</title><p>A principal limitation is the sample size at the person level (n=70). 
Although the dataset provided substantial power for within-person relations, we were severely underpowered to test whether relationships differ across demographic or socioeconomic subgroups. This is concerning given evidence that passive-sensing features can have population-specific associations with mental health; for example, [<xref ref-type="bibr" rid="ref70">70</xref>] found that mobility patterns that were protective for employed/insured individuals paradoxically predicted higher depression risk for unemployed, uninsured, and low-income groups. Given that our sample was predominantly White (84.8%) and recruited from a single Midwestern region, the &#x201C;digital phenotypes&#x201D; discovered here (eg, specific wellness apps or communication styles) may reflect cultural norms that do not generalize. Future work must validate these semantic signals in diverse cohorts to ensure algorithmic equity. This limitation also constrains interpretation of subject holdout performance; with approximately 35 participants per split, we cannot distinguish whether near-chance between-person prediction reflects fundamental idiosyncrasy of digital behaviors or insufficient training data to learn cross-person regularities. Larger multisite samples are needed to adjudicate.</p><p>Additionally, we only analyzed screenshots linked to completed EMAs and did not explicitly model the missingness of the EMA outcome. Prior work shows higher phone use predicts higher EMA response [<xref ref-type="bibr" rid="ref17">17</xref>] and that response rates vary with study characteristics (eg, decline with longer inclusion) [<xref ref-type="bibr" rid="ref71">71</xref>], implying that nonresponse is predictable from vision/language features and study design. In related analyses (unpublished data), we found no evidence that capture degraded over time. 
This high adherence rate (&#x003E;99% retention of capture capability) suggests that, despite the intrusive nature of continuous screenshotting, high-risk clinical populations may find &#x201C;passive&#x201D; surveillance more acceptable than the burden of active logging, provided (as in this study) strict privacy protocols are transparently communicated.</p><p>Future work should model EMA missingness jointly with risk (eg, selection or pattern-mixture approaches) or leverage screenshot-derived features for imputation/reweighting so that performance estimates reflect the full user experience. Our primary outcome was SI reported via EMA, not suicidal behavior; most ideation does not progress to action [<xref ref-type="bibr" rid="ref72">72</xref>], making these models better suited as just-in-time intervention triggers (ie, targeting suicidal thinking; [<xref ref-type="bibr" rid="ref73">73</xref>]) than standalone risk predictors. Finally, computational constraints inherent to fine-tuning foundation models on millions of images precluded extensive hyperparameter search, cross-validation, and sensitivity analyses (eg, varying the pre-EMA time window). Results should be interpreted as proof-of-concept under a specific configuration rather than optimized performance.</p><p>We examined whether EMA missingness could bias performance estimates. In a companion analysis (unpublished data), screenshot availability during EMA nonresponse showed no associations with suicide risk dynamics (mean levels, variability, and instability), suggesting non-selective coverage. Additionally, we trained models to predict EMA completion from temporal features and screenshot content; discrimination was modest (AUC=0.60 for temporal baseline, 0.66 with VLM), indicating that missingness is only weakly predictable from the features used in SI prediction. 
Together, these findings suggest that the ~31% missing EMAs are unlikely to substantially bias performance estimates, though future work should consider joint modeling approaches.</p></sec><sec id="s4-4"><title>Toward Privacy-Preserving Deployment</title><p>Although adequate predictive accuracy relied on within-individual fine-tuning, full on-device training of VLMs is not currently feasible on consumer smartphones due to memory, energy, and training time constraints. In this study, fine-tuning&#x2014;even for compact models such as LFM2-VL&#x2014;required GPU acceleration and large-scale batched optimization, exceeding the capabilities of mobile hardware. However, inference-only deployment of smaller VLMs is already feasible on modern devices, and our results show that reduced-capacity models retain strong within-person predictive performance. In practice, personalization could occur during an initial calibration phase using secure cloud-based resources, after which inference can be performed entirely on-device without transmitting raw screenshots. Alternative strategies, such as warm-start calibration, lightweight adapter tuning, or federated learning, may further reduce computational and privacy burdens while preserving individual-specific sensitivity.</p><p>Continuous screenshot capture is intrinsically sensitive, requiring multiple layers of technical safeguards. The smaller LFM2-VL model matched or exceeded larger architectures on temporal holdout prediction, enabling a privacy-first architecture where sensitive pixel data never leaves the patient&#x2019;s phone; only computed risk scores would be transmitted to clinicians. Additional safeguards include ephemeral processing, where screenshots are held only in volatile memory during inference and immediately discarded, preventing recovery even if the device is compromised. 
Content-aware redaction can automatically mask identifiable information (faces, names, and financial data) before any storage, and our topic modeling results suggest that clinically relevant signals concentrate in specific content categories (crisis conversations and platform safety warnings), potentially enabling selective capture that preserves predictive utility while minimizing data exposure. For applications requiring formal guarantees, differential privacy mechanisms could be integrated during model training and inference to prevent reconstruction of individual screenshots from model outputs.</p><p>Effective deployment also requires stakeholder co-design extending beyond technical safeguards [<xref ref-type="bibr" rid="ref74">74</xref>,<xref ref-type="bibr" rid="ref75">75</xref>]. Participants should have granular, reversible control over data sharing, specifying which content categories to monitor, at what temporal resolution, and for what duration, with transparent interfaces showing what data are captured and transmitted. Given documented disparities in passive sensing model performance across demographic groups [<xref ref-type="bibr" rid="ref70">70</xref>], co-design processes must include diverse patient populations to ensure privacy-preserving systems do not inadvertently exacerbate health inequities. Balancing detection accuracy against false alarm burden, encryption for data in transit, and threshold co-design with end users and clinicians will be essential for responsible clinical translation.</p></sec><sec id="s4-5"><title>Clinical Deployment Considerations</title><p>Translating screenshot-based risk detection into clinical practice requires addressing the calibration limitations observed in this study. 
The substantial probability compression (calibration slope=4.71) means that model outputs do not correspond to true event probabilities; until calibration is improved through alternative aggregation strategies or sufficient individual-level calibration data, deployment systems should treat outputs as ordinal risk rankings rather than calibrated probabilities. We recommend that model outputs inform, not replace, clinical judgment: risk-event feeds could route to care-management dashboards where clinicians review alerts and determine appropriate outreach [<xref ref-type="bibr" rid="ref20">20</xref>], with integration into existing clinical workflows remaining an important implementation challenge. The modular architecture demonstrated here, with separable feature extraction, temporal aggregation, and risk scoring, is compatible with JITAI frameworks [<xref ref-type="bibr" rid="ref76">76</xref>], though integration with intervention delivery systems remains to be tested. From a model safety perspective, the fine-tuned models in this study output scalar risk probabilities rather than natural language, limiting the potential for amplifying self-harm content. Deployment in clinical systems should nonetheless incorporate content filtering on any model-generated text, restrict outputs to ordinal risk rankings rather than explanatory narratives, and require clinician-in-the-loop review before any intervention is triggered.</p></sec><sec id="s4-6"><title>Conclusions</title><p>Screenshot content predicts short-term SI with modest but reliable accuracy. Fine-tuned VLMs outperformed zero-shot approaches, and temporal holdout evaluation yielded stronger discrimination than subject holdout generalization. 
These results establish that passively captured screen content, despite its fragmentary, noisy nature, carries an extractable signal about proximal suicide risk.</p></sec></sec></body><back><notes><sec><title>Funding</title><p>This work was funded by R21MH129688.</p></sec><sec><title>Data Availability</title><p>The datasets generated and analyzed during this study are not publicly available due to the highly sensitive nature of raw smartphone screenshots, which contain personal communications and identifiable content that cannot be adequately deidentified. Derived, nonidentifiable model outputs and analysis code are available from the corresponding author on reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>RJ performed conceptualization, methodology, software, formal analysis, writing &#x2013; original draft, and visualization. WS handled software, formal analysis, and writing &#x2013; review &#x0026; editing. VK contributed to writing &#x2013; review &#x0026; editing. 
BA contributed to conceptualization, investigation, data curation, supervision, funding acquisition, and writing &#x2013; review &#x0026; editing.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AUC</term><def><p>area under the receiver operating characteristic curve</p></def></def-item><def-item><term id="abb2">AUPRC</term><def><p>area under the precision-recall curve</p></def></def-item><def-item><term id="abb3">EMA</term><def><p>ecological momentary assessment</p></def></def-item><def-item><term id="abb4">HIPAA</term><def><p>Health Insurance Portability and Accountability Act</p></def></def-item><def-item><term id="abb5">JITAI</term><def><p>just-in-time adaptive intervention</p></def></def-item><def-item><term id="abb6">LDA</term><def><p>Latent Dirichlet Allocation</p></def></def-item><def-item><term id="abb7">LFM2</term><def><p>Liquid AI Foundation Model</p></def></def-item><def-item><term id="abb8">LoRA</term><def><p>Low-Rank Adaptation</p></def></def-item><def-item><term id="abb9">NMF</term><def><p>nonnegative matrix factorization</p></def></def-item><def-item><term id="abb10">OCR</term><def><p>optical character recognition</p></def></def-item><def-item><term id="abb11">SI</term><def><p>suicidal ideation</p></def></def-item><def-item><term id="abb12">VLM</term><def><p>vision-language model</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mohr</surname><given-names>DC</given-names> </name></person-group><article-title>Passive sensing in mental health: challenges and opportunities</article-title><source>J Psychopathol Clin Sci</source><year>2025</year><month>11</month><volume>134</volume><issue>8</issue><fpage>1034</fpage><lpage>1035</lpage><pub-id 
pub-id-type="doi">10.1037/abn0001058</pub-id><pub-id pub-id-type="medline">41129378</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Coppersmith</surname><given-names>DDL</given-names> </name><name name-style="western"><surname>Ryan</surname><given-names>O</given-names> </name><name name-style="western"><surname>Fortgang</surname><given-names>RG</given-names> </name><name name-style="western"><surname>Millner</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Kleiman</surname><given-names>EM</given-names> </name><name name-style="western"><surname>Nock</surname><given-names>MK</given-names> </name></person-group><article-title>Mapping the timescale of suicidal thinking</article-title><source>Proc Natl Acad Sci U S A</source><year>2023</year><month>04</month><day>25</day><volume>120</volume><issue>17</issue><fpage>e2215434120</fpage><pub-id pub-id-type="doi">10.1073/pnas.2215434120</pub-id><pub-id pub-id-type="medline">37071683</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Czyz</surname><given-names>EK</given-names> </name><name name-style="western"><surname>King</surname><given-names>CA</given-names> </name><name name-style="western"><surname>Al-Dajani</surname><given-names>N</given-names> </name><name name-style="western"><surname>Zimmermann</surname><given-names>L</given-names> </name><name name-style="western"><surname>Hong</surname><given-names>V</given-names> </name><name name-style="western"><surname>Nahum-Shani</surname><given-names>I</given-names> </name></person-group><article-title>Ecological momentary assessments and passive sensing in the prediction of short-term suicidal ideation in young adults</article-title><source>JAMA Netw 
Open</source><year>2023</year><month>08</month><day>1</day><volume>6</volume><issue>8</issue><fpage>e2328005</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2023.28005</pub-id><pub-id pub-id-type="medline">37552477</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Haines-Delmont</surname><given-names>A</given-names> </name><name name-style="western"><surname>Chahal</surname><given-names>G</given-names> </name><name name-style="western"><surname>Bruen</surname><given-names>AJ</given-names> </name><etal/></person-group><article-title>Testing suicide risk prediction algorithms using phone measurements with patients in acute mental health settings: feasibility study</article-title><source>JMIR Mhealth Uhealth</source><year>2020</year><month>06</month><day>26</day><volume>8</volume><issue>6</issue><fpage>e15901</fpage><pub-id pub-id-type="doi">10.2196/15901</pub-id><pub-id pub-id-type="medline">32442152</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Horwitz</surname><given-names>A</given-names> </name><name name-style="western"><surname>Czyz</surname><given-names>E</given-names> </name><name name-style="western"><surname>Al-Dajani</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Utilizing daily mood diaries and wearable sensor data to predict depression and suicidal ideation among medical interns</article-title><source>J Affect Disord</source><year>2022</year><month>09</month><day>15</day><volume>313</volume><fpage>1</fpage><lpage>7</lpage><pub-id pub-id-type="doi">10.1016/j.jad.2022.06.064</pub-id><pub-id pub-id-type="medline">35764227</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Horwitz</surname><given-names>AG</given-names> </name><name name-style="western"><surname>Kentopp</surname><given-names>SD</given-names> </name><name name-style="western"><surname>Cleary</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Using machine learning with intensive longitudinal data to predict depression and suicidal ideation among medical interns over time</article-title><source>Psychol Med</source><year>2023</year><month>09</month><volume>53</volume><issue>12</issue><fpage>5778</fpage><lpage>5785</lpage><pub-id pub-id-type="doi">10.1017/S0033291722003014</pub-id><pub-id pub-id-type="medline">36177889</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Auerbach</surname><given-names>RP</given-names> </name><name name-style="western"><surname>Bloom</surname><given-names>PA</given-names> </name><name name-style="western"><surname>Pagliaccio</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Using smartphone GPS data to detect the risk of adolescent suicidal thoughts and behaviors</article-title><source>JAMA Netw Open</source><year>2025</year><month>01</month><day>2</day><volume>8</volume><issue>1</issue><fpage>e2456429</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2024.56429</pub-id><pub-id pub-id-type="medline">39869336</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Amiriparian</surname><given-names>S</given-names> </name><name name-style="western"><surname>Gerczuk</surname><given-names>M</given-names> </name><name name-style="western"><surname>Lutz</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Non-invasive suicide risk prediction through speech 
analysis</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 30, 2024</comment><pub-id pub-id-type="doi">10.1109/EHB64556.2024.10805581</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>N</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>F</given-names> </name><name name-style="western"><surname>Du</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Temporal-noise-aware neural networks for suicidal ideation prediction using physiological data</article-title><source>IEEE Trans Comput Soc Syst</source><year>2024</year><volume>12</volume><issue>5</issue><fpage>2973</fpage><lpage>2985</lpage><pub-id pub-id-type="doi">10.1109/TCSS.2024.3523928</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Um</surname><given-names>J</given-names> </name><name name-style="western"><surname>Park</surname><given-names>J</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>DE</given-names> </name><name name-style="western"><surname>Ahn</surname><given-names>JE</given-names> </name><name name-style="western"><surname>Baek</surname><given-names>JH</given-names> </name></person-group><article-title>Machine learning models to identify individuals with imminent suicide risk using a wearable device: a pilot study</article-title><source>Psychiatry Investig</source><year>2025</year><month>02</month><volume>22</volume><issue>2</issue><fpage>156</fpage><lpage>166</lpage><pub-id pub-id-type="doi">10.30773/pi.2024.0257</pub-id><pub-id pub-id-type="medline">40017279</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Sheridan</surname><given-names>DC</given-names> </name><name name-style="western"><surname>Baker</surname><given-names>S</given-names> </name><name name-style="western"><surname>Dehart</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Heart rate variability and its ability to detect worsening suicidality in adolescents: a pilot trial of wearable technology</article-title><source>Psychiatry Investig</source><year>2021</year><month>10</month><volume>18</volume><issue>10</issue><fpage>928</fpage><lpage>935</lpage><pub-id pub-id-type="doi">10.30773/pi.2021.0057</pub-id><pub-id pub-id-type="medline">34555890</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kleiman</surname><given-names>EM</given-names> </name><name name-style="western"><surname>Bentley</surname><given-names>KH</given-names> </name><name name-style="western"><surname>Maimone</surname><given-names>JS</given-names> </name><etal/></person-group><article-title>Can passive measurement of physiological distress help better predict suicidal thinking?</article-title><source>Transl Psychiatry</source><year>2021</year><month>12</month><day>2</day><volume>11</volume><issue>1</issue><fpage>611</fpage><pub-id pub-id-type="doi">10.1038/s41398-021-01730-y</pub-id><pub-id pub-id-type="medline">34857731</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Barrigon</surname><given-names>ML</given-names> </name><name name-style="western"><surname>Romero-Medrano</surname><given-names>L</given-names> </name><name name-style="western"><surname>Moreno-Mu&#x00F1;oz</surname><given-names>P</given-names> </name><etal/></person-group><article-title>One-week suicide risk prediction using real-time smartphone monitoring: 
prospective cohort study</article-title><source>J Med Internet Res</source><year>2023</year><month>09</month><day>1</day><volume>25</volume><fpage>e43719</fpage><pub-id pub-id-type="doi">10.2196/43719</pub-id><pub-id pub-id-type="medline">37656498</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Paz-Arbaizar</surname><given-names>L</given-names> </name><name name-style="western"><surname>Lopez-Castroman</surname><given-names>J</given-names> </name><name name-style="western"><surname>Art&#x00E9;s-Rodr&#x00ED;guez</surname><given-names>A</given-names> </name><name name-style="western"><surname>Olmos</surname><given-names>PM</given-names> </name><name name-style="western"><surname>Ram&#x00ED;rez</surname><given-names>D</given-names> </name></person-group><article-title>Emotion forecasting: a transformer-based approach</article-title><source>J Med Internet Res</source><year>2025</year><month>03</month><day>18</day><volume>27</volume><fpage>e63962</fpage><pub-id pub-id-type="doi">10.2196/63962</pub-id><pub-id pub-id-type="medline">40101216</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kadirvelu</surname><given-names>B</given-names> </name><name name-style="western"><surname>Bellido Bel</surname><given-names>T</given-names> </name><name name-style="western"><surname>Freccero</surname><given-names>A</given-names> </name><name name-style="western"><surname>Di Simplico</surname><given-names>M</given-names> </name><name name-style="western"><surname>Nicholls</surname><given-names>D</given-names> </name><name name-style="western"><surname>Faisal</surname><given-names>AA</given-names> </name></person-group><article-title>Digital phenotyping for adolescent mental health: feasibility study using machine learning to predict 
mental health risk from active and passive smartphone data</article-title><source>J Med Internet Res</source><year>2026</year><month>02</month><day>4</day><volume>28</volume><fpage>e72501</fpage><pub-id pub-id-type="doi">10.2196/72501</pub-id><pub-id pub-id-type="medline">41637624</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ammerman</surname><given-names>BA</given-names> </name><name name-style="western"><surname>Kleiman</surname><given-names>EM</given-names> </name><name name-style="western"><surname>O&#x2019;Brien</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Smartphone-based text obtained via passive sensing as it relates to direct suicide risk assessment</article-title><source>Psychol Med</source><year>2025</year><month>05</month><day>9</day><volume>55</volume><fpage>e144</fpage><pub-id pub-id-type="doi">10.1017/S0033291725001199</pub-id><pub-id pub-id-type="medline">40340954</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jacobucci</surname><given-names>R</given-names> </name><name name-style="western"><surname>Blacutt</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ram</surname><given-names>N</given-names> </name><name name-style="western"><surname>Ammerman</surname><given-names>BA</given-names> </name></person-group><article-title>Smartphone screen time and suicide risk in daily life captured through high-resolution screenshot data</article-title><source>NPJ Digit Med</source><year>2025</year><month>05</month><day>29</day><volume>8</volume><issue>1</issue><fpage>321</fpage><pub-id pub-id-type="doi">10.1038/s41746-025-01740-w</pub-id><pub-id pub-id-type="medline">40442251</pub-id></nlm-citation></ref><ref 
id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jacobucci</surname><given-names>R</given-names> </name><name name-style="western"><surname>Jones</surname><given-names>SG</given-names> </name><name name-style="western"><surname>Blacutt</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ammerman</surname><given-names>BA</given-names> </name></person-group><article-title>Passive vs active nighttime smartphone use as markers of next-day suicide risk</article-title><source>JAMA Netw Open</source><year>2025</year><month>11</month><day>3</day><volume>8</volume><issue>11</issue><fpage>e2542675</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2025.42675</pub-id><pub-id pub-id-type="medline">41217755</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>B&#x00FC;scher</surname><given-names>R</given-names> </name><name name-style="western"><surname>Winkler</surname><given-names>T</given-names> </name><name name-style="western"><surname>Mocellin</surname><given-names>J</given-names> </name><etal/></person-group><article-title>A systematic review on passive sensing for the prediction of suicidal thoughts and behaviors</article-title><source>Npj Ment Health Res</source><year>2024</year><month>09</month><day>23</day><volume>3</volume><issue>1</issue><fpage>42</fpage><pub-id pub-id-type="doi">10.1038/s44184-024-00089-4</pub-id><pub-id pub-id-type="medline">39313519</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Torous</surname><given-names>J</given-names> </name><name name-style="western"><surname>Bucci</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Bell</surname><given-names>IH</given-names> </name><etal/></person-group><article-title>The growing field of digital psychiatry: current evidence and the future of apps, social media, chatbots, and virtual reality</article-title><source>World Psychiatry</source><year>2021</year><month>10</month><volume>20</volume><issue>3</issue><fpage>318</fpage><lpage>335</lpage><pub-id pub-id-type="doi">10.1002/wps.20883</pub-id><pub-id pub-id-type="medline">34505369</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Funkhouser</surname><given-names>CJ</given-names> </name><name name-style="western"><surname>Trivedi</surname><given-names>E</given-names> </name><name name-style="western"><surname>Li</surname><given-names>LY</given-names> </name><etal/></person-group><article-title>Detecting adolescent depression through passive monitoring of linguistic markers in smartphone communication</article-title><source>J Child Psychol Psychiatry</source><year>2024</year><month>07</month><volume>65</volume><issue>7</issue><fpage>932</fpage><lpage>941</lpage><pub-id pub-id-type="doi">10.1111/jcpp.13931</pub-id><pub-id pub-id-type="medline">38098445</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Knol</surname><given-names>L</given-names> </name><name name-style="western"><surname>Nagpal</surname><given-names>A</given-names> </name><name name-style="western"><surname>Leaning</surname><given-names>IE</given-names> </name><etal/></person-group><article-title>Smartphone keyboard dynamics predict affect in suicidal ideation</article-title><source>NPJ Digit Med</source><year>2024</year><month>03</month><day>1</day><volume>7</volume><issue>1</issue><fpage>54</fpage><pub-id 
pub-id-type="doi">10.1038/s41746-024-01048-1</pub-id><pub-id pub-id-type="medline">38429434</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Ning</surname><given-names>E</given-names> </name><name name-style="western"><surname>Ross</surname><given-names>MK</given-names> </name><etal/></person-group><article-title>Digital phenotypes of mobile keyboard backspace rates and their associations with symptoms of mood disorder: algorithm development and validation</article-title><source>J Med Internet Res</source><year>2024</year><month>10</month><day>29</day><volume>26</volume><fpage>e51269</fpage><pub-id pub-id-type="doi">10.2196/51269</pub-id><pub-id pub-id-type="medline">39471368</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Bloom</surname><given-names>PA</given-names> </name><name name-style="western"><surname>Treves</surname><given-names>IN</given-names> </name><name name-style="western"><surname>Pagliaccio</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Identifying suicide-related language in smartphone keyboard entries among high-risk adolescents</article-title><source>PsyArXiv</source><comment>Preprint posted online on  Sep 2, 2025</comment><pub-id pub-id-type="doi">10.31234/osf.io/gfa7h_v1</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ram</surname><given-names>N</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Cho</surname><given-names>MJ</given-names> 
</name><etal/></person-group><article-title>Screenomics: a new approach for observing and studying individuals&#x2019; digital lives</article-title><source>J Adolesc Res</source><year>2020</year><month>01</month><volume>35</volume><issue>1</issue><fpage>16</fpage><lpage>50</lpage><pub-id pub-id-type="doi">10.1177/0743558419883362</pub-id><pub-id pub-id-type="medline">32161431</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Campbell</surname><given-names>DT</given-names> </name><name name-style="western"><surname>Fiske</surname><given-names>DW</given-names> </name></person-group><article-title>Convergent and discriminant validation by the multitrait-multimethod matrix</article-title><source>Psychol Bull</source><year>1959</year><month>03</month><volume>56</volume><issue>2</issue><fpage>81</fpage><lpage>105</lpage><pub-id pub-id-type="doi">10.1037/h0046016</pub-id><pub-id pub-id-type="medline">13634291</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Molenaar</surname><given-names>PCM</given-names> </name></person-group><article-title>A manifesto on psychology as idiographic science: bringing the person back into scientific psychology, this time forever</article-title><source>Meas Interdiscip Res Perspect</source><year>2004</year><month>10</month><volume>2</volume><issue>4</issue><fpage>201</fpage><lpage>218</lpage><pub-id pub-id-type="doi">10.1207/s15366359mea0204_1</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Balliu</surname><given-names>B</given-names> </name><name name-style="western"><surname>Douglas</surname><given-names>C</given-names> </name><name 
name-style="western"><surname>Seok</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Personalized mood prediction from patterns of behavior collected with smartphones</article-title><source>NPJ Digit Med</source><year>2024</year><month>02</month><day>28</day><volume>7</volume><issue>1</issue><fpage>49</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01035-6</pub-id><pub-id pub-id-type="medline">38418551</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nahum-Shani</surname><given-names>I</given-names> </name><name name-style="western"><surname>Murphy</surname><given-names>SA</given-names> </name></person-group><article-title>Just-in-time adaptive interventions: where are we now and what is next?</article-title><source>Annu Rev Psychol</source><year>2026</year><month>01</month><volume>77</volume><issue>1</issue><fpage>679</fpage><lpage>703</lpage><pub-id pub-id-type="doi">10.1146/annurev-psych-121024-044244</pub-id><pub-id pub-id-type="medline">40939059</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Roberts</surname><given-names>DR</given-names> </name><name name-style="western"><surname>Bahn</surname><given-names>V</given-names> </name><name name-style="western"><surname>Ciuti</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Cross&#x2010;validation strategies for data with temporal, spatial, hierarchical, or phylogenetic structure</article-title><source>Ecography</source><year>2017</year><month>08</month><access-date>2026-03-16</access-date><volume>40</volume><issue>8</issue><fpage>913</fpage><lpage>929</lpage><comment><ext-link ext-link-type="uri" 
xlink:href="https://nsojournals.onlinelibrary.wiley.com/toc/16000587/40/8">https://nsojournals.onlinelibrary.wiley.com/toc/16000587/40/8</ext-link></comment><pub-id pub-id-type="doi">10.1111/ecog.02881</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Saeb</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lonini</surname><given-names>L</given-names> </name><name name-style="western"><surname>Jayaraman</surname><given-names>A</given-names> </name><name name-style="western"><surname>Mohr</surname><given-names>DC</given-names> </name><name name-style="western"><surname>Kording</surname><given-names>KP</given-names> </name></person-group><article-title>The need to approximate the use-case in clinical machine learning</article-title><source>Gigascience</source><year>2017</year><month>05</month><day>1</day><volume>6</volume><issue>5</issue><fpage>1</fpage><lpage>9</lpage><pub-id pub-id-type="doi">10.1093/gigascience/gix019</pub-id><pub-id pub-id-type="medline">28327985</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kapoor</surname><given-names>S</given-names> </name><name name-style="western"><surname>Narayanan</surname><given-names>A</given-names> </name></person-group><article-title>Leakage and the reproducibility crisis in machine-learning-based science</article-title><source>Patterns (N Y)</source><year>2023</year><month>09</month><day>8</day><volume>4</volume><issue>9</issue><fpage>100804</fpage><pub-id pub-id-type="doi">10.1016/j.patter.2023.100804</pub-id><pub-id pub-id-type="medline">37720327</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Yee</surname><given-names>AZH</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>R</given-names> </name><name name-style="western"><surname>Lim</surname><given-names>SS</given-names> </name><etal/></person-group><article-title>ScreenLife capture: an open-source and user-friendly framework for collecting screenomes from Android smartphones</article-title><source>Behav Res Methods</source><year>2023</year><month>12</month><volume>55</volume><issue>8</issue><fpage>4068</fpage><lpage>4085</lpage><pub-id pub-id-type="doi">10.3758/s13428-022-02006-z</pub-id><pub-id pub-id-type="medline">36289177</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Forkmann</surname><given-names>T</given-names> </name><name name-style="western"><surname>Spangenberg</surname><given-names>L</given-names> </name><name name-style="western"><surname>Rath</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Assessing suicidality in real time: a psychometric evaluation of self-report items for the assessment of suicidal ideation and its proximal risk factors using ecological momentary assessments</article-title><source>J Abnorm Psychol</source><year>2018</year><month>11</month><volume>127</volume><issue>8</issue><fpage>758</fpage><lpage>769</lpage><pub-id pub-id-type="doi">10.1037/abn0000381</pub-id><pub-id pub-id-type="medline">30299116</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bulteel</surname><given-names>K</given-names> </name><name name-style="western"><surname>Mestdagh</surname><given-names>M</given-names> </name><name name-style="western"><surname>Tuerlinckx</surname><given-names>F</given-names> </name><name 
name-style="western"><surname>Ceulemans</surname><given-names>E</given-names> </name></person-group><article-title>VAR(1) based models do not always outpredict AR(1) models in typical psychological applications</article-title><source>Psychol Methods</source><year>2018</year><month>12</month><volume>23</volume><issue>4</issue><fpage>740</fpage><lpage>756</lpage><pub-id pub-id-type="doi">10.1037/met0000178</pub-id><pub-id pub-id-type="medline">29745683</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kaufman</surname><given-names>S</given-names> </name><name name-style="western"><surname>Rosset</surname><given-names>S</given-names> </name><name name-style="western"><surname>Perlich</surname><given-names>C</given-names> </name></person-group><article-title>Leakage in data mining: formulation, detection, and avoidance</article-title><year>2011</year><conf-name>Proceedings of the 17th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</conf-name><conf-date>Aug 21-24, 2011</conf-date><conf-loc>San Diego, CA</conf-loc><fpage>556</fpage><lpage>563</lpage><pub-id pub-id-type="doi">10.1145/2020408.2020496</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bergmeir</surname><given-names>C</given-names> </name><name name-style="western"><surname>Ben&#x00ED;tez</surname><given-names>JM</given-names> </name></person-group><article-title>On the use of cross-validation for time series predictor evaluation</article-title><source>Inf Sci (Ny)</source><year>2012</year><month>05</month><volume>191</volume><fpage>192</fpage><lpage>213</lpage><pub-id pub-id-type="doi">10.1016/j.ins.2011.12.028</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Fisher</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Medaglia</surname><given-names>JD</given-names> </name><name name-style="western"><surname>Jeronimus</surname><given-names>BF</given-names> </name></person-group><article-title>Lack of group-to-individual generalizability is a threat to human subjects research</article-title><source>Proc Natl Acad Sci U S A</source><year>2018</year><month>07</month><day>3</day><volume>115</volume><issue>27</issue><fpage>E6106</fpage><lpage>E6115</lpage><pub-id pub-id-type="doi">10.1073/pnas.1711978115</pub-id><pub-id pub-id-type="medline">29915059</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Piccirillo</surname><given-names>ML</given-names> </name><name name-style="western"><surname>Rodebaugh</surname><given-names>TL</given-names> </name></person-group><article-title>Foundations of idiographic methods in psychology and applications for psychotherapy</article-title><source>Clin Psychol Rev</source><year>2019</year><month>07</month><volume>71</volume><fpage>90</fpage><lpage>100</lpage><pub-id pub-id-type="doi">10.1016/j.cpr.2019.01.002</pub-id><pub-id pub-id-type="medline">30665765</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Little</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Varoquaux</surname><given-names>G</given-names> </name><name name-style="western"><surname>Saeb</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Using and understanding cross-validation strategies. 
Perspectives on Saeb et al</article-title><source>Gigascience</source><year>2017</year><month>05</month><day>1</day><volume>6</volume><issue>5</issue><fpage>1</fpage><lpage>6</lpage><pub-id pub-id-type="doi">10.1093/gigascience/gix020</pub-id><pub-id pub-id-type="medline">28327989</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>de Rooij</surname><given-names>M</given-names> </name><name name-style="western"><surname>Weeda</surname><given-names>W</given-names> </name></person-group><article-title>Cross-validation: a method every psychologist should know</article-title><source>Adv Methods Pract Psychol Sci</source><year>2020</year><month>06</month><volume>3</volume><issue>2</issue><fpage>248</fpage><lpage>263</lpage><pub-id pub-id-type="doi">10.1177/2515245919898466</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ten-Dom&#x00E9;nech</surname><given-names>I</given-names> </name><name name-style="western"><surname>P&#x00E9;rez-Guaita</surname><given-names>D</given-names> </name><name name-style="western"><surname>Quint&#x00E1;s</surname><given-names>G</given-names> </name><name name-style="western"><surname>Kuligowski</surname><given-names>J</given-names> </name></person-group><article-title>Analysis of longitudinal data using constrained repeated random sampling-cross validation (CORRS-CV) and partial least squares</article-title><source>Chemometr Intell Lab Syst</source><year>2023</year><month>04</month><volume>235</volume><fpage>104776</fpage><pub-id pub-id-type="doi">10.1016/j.chemolab.2023.104776</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Tashman</surname><given-names>LJ</given-names> </name></person-group><article-title>Out-of-sample tests of forecasting accuracy: an analysis and review</article-title><source>Int J Forecast</source><year>2000</year><month>10</month><volume>16</volume><issue>4</issue><fpage>437</fpage><lpage>450</lpage><pub-id pub-id-type="doi">10.1016/S0169-2070(00)00065-0</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Blacutt</surname><given-names>M</given-names> </name><name name-style="western"><surname>Jacobucci</surname><given-names>R</given-names> </name><name name-style="western"><surname>Ammerman</surname><given-names>BA</given-names> </name></person-group><article-title>Modeling the dynamics of perceived burdensomeness, thwarted belongingness, and suicidal ideation in continuous time</article-title><source>J Psychopathol Clin Sci</source><year>2025</year><month>09</month><day>4</day><pub-id pub-id-type="doi">10.1037/abn0001048</pub-id><pub-id pub-id-type="medline">40906035</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Davis</surname><given-names>J</given-names> </name><name name-style="western"><surname>Goadrich</surname><given-names>M</given-names> </name></person-group><article-title>The relationship between precision-recall and ROC curves</article-title><conf-name>23rd International Conference on Machine Learning</conf-name><conf-date>Jun 25-29, 2006</conf-date><conf-loc>Pittsburgh, Pennsylvania, USA</conf-loc><fpage>233</fpage><lpage>240</lpage><pub-id pub-id-type="doi">10.1145/1143844.1143874</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Fawcett</surname><given-names>T</given-names> </name></person-group><article-title>An introduction to ROC analysis</article-title><source>Pattern Recognit Lett</source><year>2006</year><month>06</month><volume>27</volume><issue>8</issue><fpage>861</fpage><lpage>874</lpage><pub-id pub-id-type="doi">10.1016/j.patrec.2005.10.010</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Saito</surname><given-names>T</given-names> </name><name name-style="western"><surname>Rehmsmeier</surname><given-names>M</given-names> </name></person-group><article-title>The precision-recall plot is more informative than the ROC plot when evaluating binary classifiers on imbalanced datasets</article-title><source>PLoS One</source><year>2015</year><volume>10</volume><issue>3</issue><fpage>e0118432</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0118432</pub-id><pub-id pub-id-type="medline">25738806</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="web"><article-title>JaidedAI/easyocr</article-title><source>GitHub</source><year>2024</year><access-date>2026-03-16</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/JaidedAI/EasyOCR">https://github.com/JaidedAI/EasyOCR</ext-link></comment></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Xiao</surname><given-names>B</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>W</given-names> </name><etal/></person-group><article-title>Florence-2: advancing a unified representation for a variety of vision tasks</article-title><conf-name>2024 IEEE/CVF Conference on Computer Vision and Pattern 
Recognition (CVPR)</conf-name><conf-date>Jun 16-22, 2024</conf-date><pub-id pub-id-type="doi">10.1109/CVPR52733.2024.00461</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Yao</surname><given-names>B</given-names> </name><name name-style="western"><surname>Dong</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Mental-LLM: leveraging large language models for mental health prediction via online text data</article-title><source>Proc ACM Interact Mob Wearable Ubiquitous Technol</source><year>2024</year><month>03</month><volume>8</volume><issue>1</issue><fpage>31</fpage><pub-id pub-id-type="doi">10.1145/3643540</pub-id><pub-id pub-id-type="medline">39925940</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Bai</surname><given-names>S</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>K</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Qwen2.5-VL technical report</article-title><source>arXiv</source><comment>Preprint posted online on  Feb 19, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2502.13923</pub-id></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>EJ</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wallis</surname><given-names>P</given-names> </name><etal/></person-group><article-title>LoRA: low-rank adaptation of large language 
models</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 16, 2021</comment><pub-id pub-id-type="doi">10.48550/arXiv.2106.09685</pub-id></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>A</given-names> </name><name name-style="western"><surname>Li</surname><given-names>A</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Qwen3 technical report</article-title><source>arXiv</source><comment>Preprint posted online on  May 14, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2505.09388</pub-id></nlm-citation></ref><ref id="ref54"><label>54</label><nlm-citation citation-type="web"><article-title>Introducing LFM2: the fastest on-device foundation models on the market</article-title><source>Liquid AI Blog</source><year>2025</year><month>07</month><day>10</day><access-date>2026-03-16</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.liquid.ai/blog/liquid-foundation-models-v2-our-second-series-of-generative-ai-models">https://www.liquid.ai/blog/liquid-foundation-models-v2-our-second-series-of-generative-ai-models</ext-link></comment></nlm-citation></ref><ref id="ref55"><label>55</label><nlm-citation citation-type="web"><article-title>LFM2-VL: efficient vision-language models</article-title><source>Liquid AI Blog</source><year>2025</year><month>08</month><day>12</day><access-date>2026-03-16</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.liquid.ai/blog/lfm2-vl-efficient-vision-language-models">https://www.liquid.ai/blog/lfm2-vl-efficient-vision-language-models</ext-link></comment></nlm-citation></ref><ref id="ref56"><label>56</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name 
name-style="western"><surname>Tschannen</surname><given-names>M</given-names> </name><name name-style="western"><surname>Gritsenko</surname><given-names>A</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><etal/></person-group><article-title>SigLIP 2: multilingual vision-language encoders with improved semantic understanding, localization, and dense features</article-title><source>arXiv</source><comment>Preprint posted online on  Feb 20, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2502.14786</pub-id></nlm-citation></ref><ref id="ref57"><label>57</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>T</given-names> </name><name name-style="western"><surname>Guestrin</surname><given-names>C</given-names> </name></person-group><article-title>XGBoost: a scalable tree boosting system</article-title><conf-name>Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining</conf-name><conf-date>Aug 13-17, 2016</conf-date><conf-loc>San Francisco, CA, USA</conf-loc><fpage>785</fpage><lpage>794</lpage><pub-id pub-id-type="doi">10.1145/2939672.2939785</pub-id></nlm-citation></ref><ref id="ref58"><label>58</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Blei</surname><given-names>DM</given-names> </name><name name-style="western"><surname>Ng</surname><given-names>AY</given-names> </name><name name-style="western"><surname>Jordan</surname><given-names>MI</given-names> </name></person-group><article-title>Latent Dirichlet allocation</article-title><source>J Mach Learn Res</source><year>2003</year><volume>3</volume><fpage>993</fpage><lpage>1022</lpage><pub-id pub-id-type="doi">10.1162/jmlr.2003.3.4-5.993</pub-id></nlm-citation></ref><ref id="ref59"><label>59</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jacobson</surname><given-names>NC</given-names> </name><name name-style="western"><surname>Weingarden</surname><given-names>H</given-names> </name><name name-style="western"><surname>Wilhelm</surname><given-names>S</given-names> </name></person-group><article-title>Using digital phenotyping to accurately detect depression severity</article-title><source>J Nerv Ment Dis</source><year>2019</year><month>10</month><volume>207</volume><issue>10</issue><fpage>893</fpage><lpage>896</lpage><pub-id pub-id-type="doi">10.1097/NMD.0000000000001042</pub-id><pub-id pub-id-type="medline">31596769</pub-id></nlm-citation></ref><ref id="ref60"><label>60</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Coppersmith</surname><given-names>G</given-names> </name><name name-style="western"><surname>Leary</surname><given-names>R</given-names> </name><name name-style="western"><surname>Crutchley</surname><given-names>P</given-names> </name><name name-style="western"><surname>Fine</surname><given-names>A</given-names> </name></person-group><article-title>Natural language processing of social media as screening for suicide risk</article-title><source>Biomed Inform Insights</source><year>2018</year><volume>10</volume><fpage>1178222618792860</fpage><pub-id pub-id-type="doi">10.1177/1178222618792860</pub-id><pub-id pub-id-type="medline">30158822</pub-id></nlm-citation></ref><ref id="ref61"><label>61</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Zirikly</surname><given-names>A</given-names> </name><name name-style="western"><surname>Resnik</surname><given-names>P</given-names> </name><name name-style="western"><surname>Uzuner</surname><given-names>&#x00D6;</given-names> </name><name 
<article-title>CLPsych 2019 shared task: predicting the degree of suicide risk in Reddit posts</article-title>
in predictive techniques for mental health status on social media: a critical review</article-title><source>NPJ Digit Med</source><year>2020</year><volume>3</volume><issue>1</issue><fpage>43</fpage><pub-id pub-id-type="doi">10.1038/s41746-020-0233-7</pub-id><pub-id pub-id-type="medline">32219184</pub-id></nlm-citation></ref><ref id="ref64"><label>64</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Harrigian</surname><given-names>K</given-names> </name><name name-style="western"><surname>Aguirre</surname><given-names>C</given-names> </name><name name-style="western"><surname>Dredze</surname><given-names>M</given-names> </name></person-group><article-title>On the state of social media data for mental health research</article-title><year>2021</year><access-date>2026-03-16</access-date><conf-name>Proceedings of the Seventh Workshop on Computational Linguistics and Clinical Psychology</conf-name><conf-date>Jun 11, 2021</conf-date><conf-loc>Online</conf-loc><fpage>15</fpage><lpage>24</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://www.aclweb.org/anthology/2021.clpsych-1">https://www.aclweb.org/anthology/2021.clpsych-1</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/2021.clpsych-1.2</pub-id></nlm-citation></ref><ref id="ref65"><label>65</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ram</surname><given-names>N</given-names> </name><name name-style="western"><surname>Haber</surname><given-names>N</given-names> </name><name name-style="western"><surname>Robinson</surname><given-names>TN</given-names> </name><name name-style="western"><surname>Reeves</surname><given-names>B</given-names> </name></person-group><article-title>Binding the person-specific approach to modern AI in the human screenome project: moving past generalizability to transferability</article-title><source>Multivariate Behav 
Res</source><year>2024</year><volume>59</volume><issue>6</issue><fpage>1211</fpage><lpage>1219</lpage><pub-id pub-id-type="doi">10.1080/00273171.2023.2229305</pub-id><pub-id pub-id-type="medline">37439508</pub-id></nlm-citation></ref><ref id="ref66"><label>66</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jacobson</surname><given-names>NC</given-names> </name><name name-style="western"><surname>Chung</surname><given-names>YJ</given-names> </name></person-group><article-title>Passive sensing of prediction of moment-to-moment depressed mood among undergraduates with clinical levels of depression sample using smartphones</article-title><source>Sensors (Basel)</source><year>2020</year><month>06</month><day>24</day><volume>20</volume><issue>12</issue><fpage>3572</fpage><pub-id pub-id-type="doi">10.3390/s20123572</pub-id><pub-id pub-id-type="medline">32599801</pub-id></nlm-citation></ref><ref id="ref67"><label>67</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wright</surname><given-names>AGC</given-names> </name><name name-style="western"><surname>Woods</surname><given-names>WC</given-names> </name></person-group><article-title>Personalized models of psychopathology</article-title><source>Annu Rev Clin Psychol</source><year>2020</year><month>05</month><day>7</day><volume>16</volume><issue>1</issue><fpage>49</fpage><lpage>74</lpage><pub-id pub-id-type="doi">10.1146/annurev-clinpsy-102419-125032</pub-id><pub-id pub-id-type="medline">32070120</pub-id></nlm-citation></ref><ref id="ref68"><label>68</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Coppersmith</surname><given-names>DD</given-names> </name><name name-style="western"><surname>Bentley</surname><given-names>KH</given-names> </name><name 
name-style="western"><surname>Kleiman</surname><given-names>EM</given-names> </name><name name-style="western"><surname>Jaroszewski</surname><given-names>AC</given-names> </name><name name-style="western"><surname>Daniel</surname><given-names>M</given-names> </name><name name-style="western"><surname>Nock</surname><given-names>MK</given-names> </name></person-group><article-title>Automated real-time tool for promoting crisis resource use for suicide risk (ResourceBot): development and usability study</article-title><source>JMIR Ment Health</source><year>2024</year><month>10</month><day>31</day><volume>11</volume><fpage>e58409</fpage><pub-id pub-id-type="doi">10.2196/58409</pub-id><pub-id pub-id-type="medline">39481100</pub-id></nlm-citation></ref><ref id="ref69"><label>69</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>La Sala</surname><given-names>L</given-names> </name><name name-style="western"><surname>Sabo</surname><given-names>A</given-names> </name><name name-style="western"><surname>Michail</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Online safety when considering self-harm and suicide-related content: qualitative focus group study with young people, policy makers, and social media industry professionals</article-title><source>J Med Internet Res</source><year>2025</year><month>03</month><day>10</day><volume>27</volume><fpage>e66321</fpage><pub-id pub-id-type="doi">10.2196/66321</pub-id><pub-id pub-id-type="medline">40063940</pub-id></nlm-citation></ref><ref id="ref70"><label>70</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Adler</surname><given-names>DA</given-names> </name><name name-style="western"><surname>Stamatis</surname><given-names>CA</given-names> </name><name name-style="western"><surname>Meyerhoff</surname><given-names>J</given-names> 
<source>NPJ Ment Health Res</source>
pub-id-type="doi">10.1192/bjp.bp.107.040113</pub-id><pub-id pub-id-type="medline">18245022</pub-id></nlm-citation></ref><ref id="ref73"><label>73</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bentley</surname><given-names>KH</given-names> </name><name name-style="western"><surname>Ball</surname><given-names>MI</given-names> </name><name name-style="western"><surname>Bose</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Pilot microrandomized trial of a brief digital intervention for suicidal thoughts</article-title><source>J Consult Clin Psychol</source><year>2025</year><month>10</month><volume>93</volume><issue>10</issue><fpage>690</fpage><lpage>704</lpage><pub-id pub-id-type="doi">10.1037/ccp0000978</pub-id><pub-id pub-id-type="medline">41129369</pub-id></nlm-citation></ref><ref id="ref74"><label>74</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Scholten</surname><given-names>H</given-names> </name><name name-style="western"><surname>Granic</surname><given-names>I</given-names> </name></person-group><article-title>Use of the principles of design thinking to address limitations of digital mental health interventions for youth: viewpoint</article-title><source>J Med Internet Res</source><year>2019</year><month>01</month><day>14</day><volume>21</volume><issue>1</issue><fpage>e11528</fpage><pub-id pub-id-type="doi">10.2196/11528</pub-id><pub-id pub-id-type="medline">31344671</pub-id></nlm-citation></ref><ref id="ref75"><label>75</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mohr</surname><given-names>DC</given-names> </name><name name-style="western"><surname>Lyon</surname><given-names>AR</given-names> </name><name name-style="western"><surname>Lattie</surname><given-names>EG</given-names> </name><name 
name-style="western"><surname>Reddy</surname><given-names>M</given-names> </name><name name-style="western"><surname>Schueller</surname><given-names>SM</given-names> </name></person-group><article-title>Accelerating digital mental health research from early design and creation to successful implementation and sustainment</article-title><source>J Med Internet Res</source><year>2017</year><month>05</month><day>10</day><volume>19</volume><issue>5</issue><fpage>e153</fpage><pub-id pub-id-type="doi">10.2196/jmir.7725</pub-id><pub-id pub-id-type="medline">28490417</pub-id></nlm-citation></ref><ref id="ref76"><label>76</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nahum-Shani</surname><given-names>I</given-names> </name><name name-style="western"><surname>Smith</surname><given-names>SN</given-names> </name><name name-style="western"><surname>Spring</surname><given-names>BJ</given-names> </name><etal/></person-group><article-title>Just-in-time adaptive interventions (JITAIs) in mobile health: key components and design principles for ongoing health behavior support</article-title><source>Ann Behav Med</source><year>2018</year><month>05</month><day>18</day><volume>52</volume><issue>6</issue><fpage>446</fpage><lpage>462</lpage><pub-id pub-id-type="doi">10.1007/s12160-016-9830-8</pub-id><pub-id pub-id-type="medline">27663578</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Model prompts used for zero-shot suicide risk scoring (Llama 3.2 11B Vision-Instruct and Qwen3-30B-A3B), vision-language model interpretation of high-risk screenshots (Qwen-VL), and synthetic smartphone mockup generation (DALL-E 3).</p><media xlink:href="mental_v13i1e90581_app1.docx" xlink:title="DOCX File, 17 KB"/></supplementary-material></app-group></back></article>