<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Ment Health</journal-id><journal-id journal-id-type="publisher-id">mental</journal-id><journal-id journal-id-type="index">16</journal-id><journal-title>JMIR Mental Health</journal-title><abbrev-journal-title>JMIR Ment Health</abbrev-journal-title><issn pub-type="epub">2368-7959</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v13i1e87586</article-id><article-id pub-id-type="doi">10.2196/87586</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Detection of Self-Harm in Electronic Mental Health Records Using Privacy-Preserving Local Language Models: Methodological Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Kormilitzin</surname><given-names>Andrey</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Joyce</surname><given-names>Dan W</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Tsiachristas</surname><given-names>Apostolos</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref 
ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Borschmann</surname><given-names>Rohan</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff7">7</xref><xref ref-type="aff" rid="aff8">8</xref><xref ref-type="aff" rid="aff9">9</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kapur</surname><given-names>Navneet</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff5">5</xref><xref ref-type="aff" rid="aff10">10</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Geulayov</surname><given-names>Galit</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Psychiatry, University of Oxford</institution><addr-line>Warneford Hospital, Warneford Lane</addr-line><addr-line>Oxford</addr-line><addr-line>England</addr-line><country>United Kingdom</country></aff><aff id="aff2"><institution>NIHR Oxford Health Biomedical Research Centre</institution><addr-line>Oxford</addr-line><country>United Kingdom</country></aff><aff id="aff3"><institution>Institute of Population Health, University of Liverpool</institution><addr-line>Liverpool</addr-line><addr-line>England</addr-line><country>United Kingdom</country></aff><aff id="aff4"><institution>Mental Health Research for Innovation Centre, University of Liverpool</institution><addr-line>Liverpool</addr-line><addr-line>England</addr-line><country>United Kingdom</country></aff><aff id="aff5"><institution>Mersey Care NHS Foundation Trust</institution><addr-line>Liverpool</addr-line><addr-line>England</addr-line><country>United Kingdom</country></aff><aff id="aff6"><institution>Nuffield Department of Primary Care Health Sciences, University of Oxford</institution><addr-line>Oxford</addr-line><addr-line>England</addr-line><country>United 
Kingdom</country></aff><aff id="aff7"><institution>Health Service and Population Research Department, Institute of Psychiatry, Psychology &#x0026; Neuroscience, King&#x2019;s College London</institution><addr-line>London</addr-line><country>United Kingdom</country></aff><aff id="aff8"><institution>Justice Health Group, School of Population Health, Curtin University</institution><addr-line>Perth</addr-line><addr-line>Western Australia</addr-line><country>Australia</country></aff><aff id="aff9"><institution>Melbourne School of Psychological Sciences, The University of Melbourne</institution><addr-line>Melbourne</addr-line><addr-line>Victoria</addr-line><country>Australia</country></aff><aff id="aff10"><institution>Division of Psychology and Mental Health and NIHR Greater Manchester Patient Safety Research Collaboration, University of Manchester</institution><addr-line>Manchester</addr-line><addr-line>England</addr-line><country>United Kingdom</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Torous</surname><given-names>John</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Choi</surname><given-names>Chang Min</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Mohanadas</surname><given-names>Sadhasivam</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Galit Geulayov, PhD, Department of Psychiatry, University of Oxford, Warneford Hospital, Warneford Lane, Oxford, England, OX3 7JX, United Kingdom, 44 01865305337; <email>galit.geulayov@psych.ox.ac.uk</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>2</day><month>6</month><year>2026</year></pub-date><volume>13</volume><elocation-id>e87586</elocation-id><history><date 
date-type="received"><day>12</day><month>11</month><year>2025</year></date><date date-type="rev-recd"><day>08</day><month>03</month><year>2026</year></date><date date-type="accepted"><day>09</day><month>03</month><year>2026</year></date></history><copyright-statement>&#x00A9; Andrey Kormilitzin, Dan W Joyce, Apostolos Tsiachristas, Rohan Borschmann, Navneet Kapur, Galit Geulayov. Originally published in JMIR Mental Health (<ext-link ext-link-type="uri" xlink:href="https://mental.jmir.org">https://mental.jmir.org</ext-link>), 2.6.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Mental Health, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://mental.jmir.org/">https://mental.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://mental.jmir.org/2026/1/e87586"/><abstract><sec><title>Background</title><p>Self-harm is the strongest risk factor for suicide and an important outcome for mental health care. Although prevalent in clinical populations, it is often imprecisely captured in routinely collected clinical data, where it is often recorded and stored as unstructured free text. 
Contemporary language models, such as GPT (OpenAI) and Gemini (Google), can analyze free-text clinical notes, but using such models may violate data governance requirements for processing sensitive patient data.</p></sec><sec><title>Objective</title><p>This study aimed to evaluate whether a privacy-preserving language model running entirely within an institution&#x2019;s secure computing infrastructure (here, the UK National Health Service [NHS]) could accurately identify the presence and timing of self-harm using electronic health records from secondary mental health care.</p></sec><sec sec-type="methods"><title>Methods</title><p>Clinical notes were drawn from Oxford Health NHS Foundation Trust using a multistage workflow: (1) a random sample of 1000 patients with a psychiatric diagnosis, defined according to the <italic>ICD-10</italic> (<italic>International Statistical Classification of Diseases, Tenth Revision</italic>; codes F00&#x2013;F99); (2) candidate-note identification using a Gemma3-4b language model to flag notes containing self-harm content; and (3) from those candidates, 1352 randomly sampled notes were selected for expert annotation, resulting in a gold-standard corpus enriched for self-harm content. Clinical notes were annotated for the presence of self-harm and its timing (&#x2264;90 days, &#x003E;90 days, or unknown). A privacy-preserving locally served 27-billion-parameter Gemma 3 language model (&#x201C;Gemma3-27b&#x201D;) was used as the core model. Prompts were systematically developed and refined using a labeled development set to identify self-harm and generate a structured output per clinical record. Gemma3-27b performance was compared against a strong baseline multilabel text classification model based on the robustly optimized BERT pretraining approach (RoBERTa), a transformer-based language model architecture. 
Model performance was evaluated using precision, recall, and the <italic>F</italic><sub>1</sub>-score (harmonic mean of precision and recall), with 95% CIs estimated from 1000 bootstrap samples with replacement.</p></sec><sec sec-type="results"><title>Results</title><p>Gemma3-27b outperformed the RoBERTa classifier across all categories, achieving Precision=0.92, Recall=0.92 (sensitivity), and <italic>F</italic><sub>1</sub>-score=0.92 for notes containing self-harm, and Precision=0.97, Recall=0.97 (specificity), and <italic>F</italic><sub>1</sub>-score=0.97 for notes without self-harm. For the 51 notes labeled as recent self-harm in the held-out test set, Gemma3-27b achieved Precision=0.84, Recall=0.75, and <italic>F</italic><sub>1</sub>-score=0.79. The global weighted <italic>F</italic><sub>1</sub>-score of Gemma3-27b across all categories was 0.88, compared to 0.85 for RoBERTa.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>With systematic prompt development on a labeled development set, but no gradient-based fine-tuning, the current Gemma3-27b language model matched or exceeded a fine-tuned RoBERTa classifier for ascertaining self-harm events and their timing. Aggregate gains were modest, while improvements were largest in the most challenging, lower-frequency timing categories. On a simplified binary recent-versus-other task, RoBERTa performed marginally better, indicating that supervised classifiers remain highly effective when the task is simplified and sufficient labeled data exist. 
This work demonstrates the technical feasibility of privacy-preserving self-harm detection within a secure NHS research environment.</p></sec></abstract><kwd-group><kwd>self-harm</kwd><kwd>electronic health records</kwd><kwd>large language models</kwd><kwd>privacy</kwd><kwd>Gemma3</kwd><kwd>Ollama</kwd><kwd>temporal information extraction</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>Self-harm (intentional self-poisoning or self-injury, irrespective of motivation [<xref ref-type="bibr" rid="ref1">1</xref>]) represents a major public health challenge. In England, approximately 5000 individuals die by suicide each year [<xref ref-type="bibr" rid="ref2">2</xref>] and more than 200,000 individuals present to general hospitals due to self-harm [<xref ref-type="bibr" rid="ref3">3</xref>]. Many more self-harm without seeking treatment [<xref ref-type="bibr" rid="ref4">4</xref>].</p><p>Self-harm is the strongest risk factor for suicide [<xref ref-type="bibr" rid="ref5">5</xref>]. Individuals who present to clinical services following self-harm are over 100 times more likely to die by suicide compared to those who do not self-harm [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>]. Their risk of accidental death and death by natural causes is also markedly elevated [<xref ref-type="bibr" rid="ref5">5</xref>]. Furthermore, these individuals have a higher risk of further nonfatal self-harm and adverse psychosocial outcomes [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>].</p><p>Despite it being an important outcome for mental health care, information about self-harm is often imprecisely captured in many health care settings. 
For example, in one study from England [<xref ref-type="bibr" rid="ref10">10</xref>], the investigators compared research-derived rates of hospital-presenting self-harm to official hospital episode statistics [<xref ref-type="bibr" rid="ref11">11</xref>] data. The study found substantial under-ascertainment in official statistics compared with the research-derived figures, even though both sources drew on the same underlying clinical information. Accurate and systematic identification of self-harm across settings is essential for conducting valid and reliable research and for planning and delivering effective intervention strategies.</p><p>Suicide and self-harm research involves numerous methodological challenges. It can be resource-intensive and costly, with data collected and collated over many years from some (but not all) relevant settings. Consequently, many instances of self-harm go undetected, leading to missed opportunities for intervention and compromised research. Leveraging existing data collected as part of routine patient care can provide a valuable, contemporaneous, and economical source of information. Such data, which contain a wealth of information, have been used to study many health conditions, for example, cardiovascular disease, diabetes, and osteoarthritis [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. However, using such rich information comes with significant challenges, particularly due to the large volume of data, much of which is often collected and stored in an unstructured narrative format. 
Advances in artificial intelligence and natural language processing (NLP) present an opportunity to unlock, retrieve, and convert this information into a format accessible for research and clinical care.</p><p>Previously, investigators have used the Clinical Record Interactive Search (CRIS) database of the South London and Maudsley National Health Service (NHS) Foundation Trust to identify suicidal ideation and self-harm from free text in secondary mental health electronic health records (EHRs) [<xref ref-type="bibr" rid="ref14">14</xref>]. Such models show good performance in identifying patients who have self-harmed. Identifying the timing of the self-harm through free text, however, has been more challenging [<xref ref-type="bibr" rid="ref15">15</xref>]. The timing of self-harm is important for both research and clinical practice. Evaluating the effect of interventions or routine care depends on accurately establishing the timing of self-harm. Similarly, reliable longitudinal analysis relies on ascertaining the temporal sequence of self-harm alongside its covariates. Importantly, the risk of suicide and repeat self-harm is acutely elevated soon after a self-harm episode [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. As such, accurately capturing the timing of self-harm episodes is critical for identifying individuals in need of timely interventions and risk reduction strategies.</p></sec><sec id="s1-2"><title>Machine Learning for Self-Harm Identification</title><p>Well-established machine learning models for typical NLP tasks, such as named-entity recognition, relationship extraction, text classification tasks, and negation detection, have shown good ability to identify and structure the concepts of interest [<xref ref-type="bibr" rid="ref17">17</xref>]. However, training such models relies on a large amount of data, manually annotated by experts. 
Collecting a sufficient amount of high-quality annotated data can be an expensive and time-consuming task. Since the introduction of large language models (LLMs) and, in particular, GPT and their chatbot interface, such as ChatGPT, the information extraction field has seen a paradigm shift. Multiple studies have shown that generic LLMs (eg, GPT, Claude, and Gemini) trained on a massive corpus from the internet can identify concepts of interest and generate a structured output following the prompt tailored for each particular task [<xref ref-type="bibr" rid="ref18">18</xref>]. For instance, LLMs have been successfully applied to extract complex relationships between biomedical entities from the scientific literature by carefully prompting the model with a description of the desired relationship [<xref ref-type="bibr" rid="ref19">19</xref>]. Furthermore, LLMs have demonstrated strong performance in extracting events and their context from news articles, outperforming traditional supervised models in some cases, especially in low-resource settings [<xref ref-type="bibr" rid="ref20">20</xref>]. Additionally, recent work has explored the use of LLMs for extracting information from noisy and ambiguous user-generated content like social media posts, showing promising results in identifying relevant entities and topics despite the informal language and varied writing styles [<xref ref-type="bibr" rid="ref21">21</xref>].</p><p>However, the use of proprietary LLM services via their application programming interface (API), such as those provided by OpenAI (GPT), Anthropic (Claude), and Google (Gemini), poses significant challenges to patient data privacy and may not be compliant with clinical information governance. In contrast, if an LLM can be implemented within a health care institution&#x2019;s own secure clinical data environment, there is no need to risk exposing sensitive and confidential data via APIs to proprietary services. 
Until recently, implementing LLMs (including training and inference) has been implausible because of their memory and computing costs. With the introduction of quantized LLMs, models that have been made smaller and more computationally efficient by storing the numerical values of their parameters in a simpler form, open-weight models such as Gemma3-27b can be hosted and used for inference on modest compute resources with performance (for specific tasks) only marginally lower than the original (not quantized) model. Therefore, researchers have explored the use of these local versions of LLMs, such as Llama [<xref ref-type="bibr" rid="ref22">22</xref>] for information extraction from clinical records [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>], as well as for identifying acts of suicidality [<xref ref-type="bibr" rid="ref25">25</xref>].</p></sec><sec id="s1-3"><title>Motivation and Our Contribution</title><p>In this study, we evaluated whether privacy-preserving local language models can identify self-harm and its timing in secondary mental health records, converting unstructured clinical notes into structured data. We assessed the semantic reasoning capabilities of a pretrained language model to distinguish self-harm events from related concepts (eg, ideation and risk assessments) and classify their timing. Specifically, we tested whether an open-weight Gemma3 model with 27 billion parameters, deployed locally, can accurately detect self-harm and identify its timing without gradient-based fine-tuning (ie, without updating the model&#x2019;s internal parameters on our data), relying instead on systematic prompt development using a labeled development set.</p><p>We compared the current approach against a supervised robustly optimized BERT pretraining approach (RoBERTa) classifier (a commonly used model) trained on identical data. 
Model performance was assessed using precision, recall, and the <italic>F</italic><sub>1</sub>-score (the harmonic mean of precision and recall) with 95% CIs estimated using 1000 bootstrap samples with replacement. We hypothesized that local language models would (1) match or exceed supervised model performance through prompt-based inference guided by a labeled development set, thereby reducing the volume of annotated data needed for gradient-based training; and (2) mitigate data governance barriers inherent in using cloud-based solutions via APIs, enabling deployment within health care institutions.</p><p>This work addresses the critical need for accurate self-harm identification in clinical records, using language models that can be deployed locally under strict patient confidentiality standards for sensitive mental health data and within constrained computational resources.</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Definition of Self-Harm</title><p>Self-harm refers to any form of intentional self-poisoning or self-injury, irrespective of motivation [<xref ref-type="bibr" rid="ref1">1</xref>]. It can take many forms, including overdosing on medications, ingesting a non-ingestible substance, or inflicting injury upon oneself through actions like cutting. In clinical settings (eg, hospital emergency departments and mental health services), self-harm ascertainment relies on a clinician&#x2019;s judgment; ie, a clinician will determine whether the self-inflicted act was intentional, as opposed to accidental, even in the absence of patient confirmation [<xref ref-type="bibr" rid="ref26">26</xref>].</p></sec><sec id="s2-2"><title>Data Source and Ethics</title><p>Data for this study were sourced from the CRIS system by the Akrivia Health [<xref ref-type="bibr" rid="ref27">27</xref>] analytics platform on behalf of the Oxford Health NHS Foundation Trust, UK. 
Akrivia Health [<xref ref-type="bibr" rid="ref27">27</xref>] provides a secure research environment with a robust information governance framework compliant with national statutory regulations for health care data. The CRIS database comprises pseudonymized EHRs including free-text clinical notes as well as structured data fields from secondary care mental health services [<xref ref-type="bibr" rid="ref28">28</xref>]. Studies using this platform require approval from the health care institution that provided the data.</p></sec><sec id="s2-3"><title>Cohort Selection</title><p>The study population involved individuals aged 18 years or older with a confirmed psychiatric diagnosis (see Section A of <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) according to the <italic>ICD-10</italic> (<italic>International Statistical Classification of Diseases, Tenth Revision</italic>), who were in contact with specialist secondary mental health care services. Clinical records of patients with primary diagnoses of <italic>ICD-10</italic> codes F00-F99: Mental and Behavioral Disorders (see Section A of <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>), recorded between March 1, 2016, and March 1, 2022 (inclusive), were randomly sampled for annotation, as described in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>The data flow diagram of the clinical notes used to develop the models.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mental_v13i1e87586_fig01.png"/></fig><p>In contrast to a common approach of identifying clinical notes that contain mentions of self-harm using a key-word fuzzy pattern matching (which can be biased in identifying cases beyond the predefined keywords), we opted for using a small, yet capable LM (&#x201C;Gemma3&#x201D;) with 4 billion parameters 
(&#x201C;Gemma3-4b&#x201D;) prompted to identify potential self-harm events, minimizing the risk of the aforementioned keyword-based bias (see Section B of <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><p><xref ref-type="fig" rid="figure1">Figure 1</xref> provides an overview of the end-to-end data selection process in this study. First, an initial cohort of 43,791 patients with over 5.2 million clinical notes satisfying the above criteria was extracted from the Oxford Health NHS CRIS database (Step 1). Given the limited human annotator resources, we randomly sampled 1000 patients (Step 2), followed by retrieving clinical notes containing potential mentions of self-harm using a lightweight &#x201C;Gemma3-4b&#x201D; LM (Step 3). However, the model identified over 11,000 notes containing self-harm, which significantly exceeded the capacity of our clinical annotators. As such, we further randomly reduced the cohort of 11,655 clinical notes containing self-harm to 1352 clinical notes from 628 unique patients (Step 4).</p><p>Since we used the Gemma3-4b language model to initially identify clinical notes that might contain self-harm cases, the text we found was richer in this type of content than one would find across all patient records. To mitigate this selection bias, we further randomly sampled 1352 notes from 628 different patients for expert review. While our annotated dataset likely contains more self-harm language than a typical sample would, the random selection process ensured that we were not inadvertently favoring particular patients, specific time frames, or certain diagnostic groups.</p><p>This enrichment improves annotation efficiency by increasing the proportion of positive examples available for model development, but it changes the class distribution relative to routine care. 
In particular, model performance metrics (for example, positive predictive value) may differ when the model is applied to all clinical notes, where the prevalence of self-harm mentions is substantially lower. Implications for routine deployment are discussed in the Limitations section.</p></sec><sec id="s2-4"><title>An Annotation Schema and a Curated Dataset</title><p>Manual annotation of textual data is essential for developing and evaluating NLP models for information extraction. A systematic annotation process involving expert coders ensures unambiguous tagging of text segments according to a predefined schema, enabling models to learn meaningful patterns and providing a human benchmark for performance.</p><p>We used a multilabel annotation schema focusing on whether an actual act of self-harm occurred and whether it was (1) recent (occurring within 90 days of documentation), (2) historical (occurring &#x003E;90 days prior to documentation), or (3) of unknown timing (the timing could not be determined). Specifically, if annotators identified a self-harm event, the corresponding text was labeled as &#x201C;Self-harm present&#x201D; along with a timing tag. Where it was not possible to unambiguously determine whether a self-harm event took place, we labeled these cases as &#x201C;unknown self-harm.&#x201D; Clinical notes that did not include self-harm events (eg, a clinical note describing a patient with psychotic symptoms with no mentions of self-harm or one that mentions risk of self-harm but not actual self-harm) were unlabeled. The annotation protocol included initial training and calibration sessions, detailed guidelines with clinical examples, and regular interrater reliability assessments. Disagreements were jointly reviewed to refine decision boundaries and to ensure consistency. 
The schema thus comprises 5 distinct labels as shown in <xref ref-type="table" rid="table1">Table 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Five categories to designate the self-harm events used in the study.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Label</td><td align="left" valign="bottom">Explanation</td></tr></thead><tbody><tr><td align="left" valign="top">Self-harm absent</td><td align="left" valign="top">Statement negating self-harm or where self-harm was not mentioned at all (eg, a clinical note recording only psychotic symptoms).</td></tr><tr><td align="left" valign="top">Self-harm present</td><td align="left" valign="top">Explicit description of a self-harm event (eg, self-poisoning or self-injury) by the patient.</td></tr><tr><td align="left" valign="top">Recent</td><td align="left" valign="top">Event occurred within the last 90 days prior to the note date.</td></tr><tr><td align="left" valign="top">Historical</td><td align="left" valign="top">Event occurred more than 90 days prior to the note date.</td></tr><tr><td align="left" valign="top">Unknown timing</td><td align="left" valign="top">Where time cannot be determined from the information provided (eg, &#x201C;the patient has self-harmed previously&#x201D; or &#x201C;Deliberate self-harm scars and burns evident&#x201D;).</td></tr></tbody></table></table-wrap><p>All forms of intentional self-inflicted harm (including suicide attempts and self-harm where the specific motivation was not explicitly mentioned) were in scope; self-harm ideation (eg, &#x201C;patient feels like cutting&#x201D; and &#x201C;patient wishes to end it all&#x201D;) was excluded unless a self-harm act was also mentioned.</p><p>Many patients had a long documented history of contact with secondary mental health services. 
Annotators were instructed to treat each clinical note extract as a standalone document, independent of any decisions made about previous extracts from the same patient.</p></sec><sec id="s2-5"><title>Partitioning of Data for Training and Validation</title><p>The annotated sample of 1352 examples was split at the patient level, to avoid data leakage, into training (80%, n=1084) and test (20%, n=268) sets. The resulting class distribution within the training and test splits is summarized in <xref ref-type="fig" rid="figure2">Figure 2</xref>. In most instances, when self-harm was mentioned by a clinician, there was sufficient information to determine whether this was a recent or past event.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>The distribution of 1352 clinical notes in the gold standard corpus according to self-harm status and timing labels, by training (80%) and testing (20%) sets.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mental_v13i1e87586_fig02.png"/></fig></sec><sec id="s2-6"><title>Annotation Procedure</title><p>In order to evaluate the developed annotation schema and the degree of agreement, 2 data annotators, including 1 researcher (LB) with over 30 years&#x2019; experience collecting and coding self-harm information from hospital records in the United Kingdom, and another (GG) with more than 15 years&#x2019; experience researching self-harm and suicidal behavior, refined category boundaries and created a decision flowchart. They then independently labeled 1352 randomly sampled notes.</p></sec><sec id="s2-7"><title>Comparing the Agreement on Annotations</title><p>Interannotator agreement was calculated on a sample of 160 notes annotated by two raters independently, using a hierarchical Cohen &#x03BA; approach. 
Each clinical note was annotated along two categories: (1) self-harm status (&#x201C;present&#x201D; or &#x201C;absent&#x201D;) and (2) timing of self-harm (&#x201C;recent,&#x201D; &#x201C;historical,&#x201D; or &#x201C;unknown&#x201D;). The timing labels were sought only if self-harm was &#x201C;present.&#x201D; As the second decision category (ie, timing of self-harm) was conditional on the first category, we opted to report interannotator agreement using a hierarchical protocol: (1) assess agreement on self-harm status (present or absent); (2) assess agreement on the timing of self-harm event (&#x201C;recent,&#x201D; &#x201C;historical,&#x201D; or &#x201C;unknown&#x201D;) within the subset of notes where both annotators agreed that self-harm was present.</p><p>For category one&#x2014;self-harm present or absent&#x2014;each note contributed to a 2-by-2 contingency table and the resulting <italic>&#x03BA;<sub>self-harm</sub></italic> captured chance-corrected concordance on event detection. Using the subset of notes where both coders marked self-harm present, each annotator assigned 1 of 3 nominal categories: recent, historical, or unknown, yielding a 3-by-3 table from which an unweighted <italic>&#x03BA;<sub>recency</sub></italic> was computed. The 3-by-3 structure reflects the 3 possible timing categories assigned independently by each of the 2 annotators. Uncertainty estimates for both <italic>&#x03BA;</italic> coefficients were obtained via 1000-fold stratified bootstrap resampling with replacement from the 160 jointly annotated notes. At each iteration, the set of notes was resampled while preserving the marginal class distribution (ie, the proportion of &#x201C;present&#x201D; vs &#x201C;absent&#x201D; for self-harm status, and the relative frequencies of &#x201C;recent,&#x201D; &#x201C;historical,&#x201D; and &#x201C;unknown&#x201D; within the subset marked as self-harm present). 
For each bootstrap replicate, <italic>&#x03BA;</italic> was recalculated, and the 2.5th and 97.5th percentiles of the resulting empirical distribution were used to form 95% CIs. This stratification ensured that class imbalance did not distort the variability estimates of <italic>&#x03BA;</italic>.</p></sec><sec id="s2-8"><title>Model Development</title></sec><sec id="s2-9"><title>Core Language Model</title><p>While language models have demonstrated strong capabilities in clinical text processing and reasoning (Wei et al [<xref ref-type="bibr" rid="ref29">29</xref>] and Huang et al [<xref ref-type="bibr" rid="ref30">30</xref>]), their computational demands can be prohibitive [<xref ref-type="bibr" rid="ref31">31</xref>]. Therefore, we explored the Gemma3-27b, a decoder-only transformer model with 27 billion parameters and a 128K token context window [<xref ref-type="bibr" rid="ref32">32</xref>]. The model is based on a novel architecture with a 5:1 ratio of local to global attention layers, where local layers use sliding window attention over 1024 tokens to reduce memory consumption during inference. The model was quantized to 4-bit precision (Q4_K_M format), resulting in a 10.6 GB model size, and was served locally within a secure environment at the Oxford Health NHS Foundation Trust using the Ollama framework (v0.9.6) with llama.cpp backend [<xref ref-type="bibr" rid="ref33">33</xref>]. All experiments were conducted on the Microsoft Azure T4 instance (&#x201C;Standard_NC8as_T4_v3&#x201D;) with 16 GB graphics processing unit (GPU) memory as a cost-effective solution for the NHS for information extraction and reasoning tasks.</p></sec><sec id="s2-10"><title>Baseline Multilabel Text Classification</title><p>To evaluate the benefit of language models for semantic reasoning on self-harm, we trained and evaluated a transformer-based text classification model as a baseline (a benchmark model). We chose the RoBERTa model for its competitive overall performance and speed. 
The model was fine-tuned using a binary cross-entropy with logits loss, natively implemented in PyTorch [<xref ref-type="bibr" rid="ref34">34</xref>]. The reason for opting for a multilabel text classification model was to mimic the behavior of a language model, whereby it outputs simultaneously both self-harm status and timing labels. For consistency, the RoBERTa model was trained and evaluated using the same training and test data splits used to develop the Gemma3 model. For reproducibility and detailed training, see Section B of <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-11"><title>Prompt Engineering</title><p>The effective use of language models for information extraction relies heavily on well-designed prompts that provide clear instructions and context for the task at hand. In this study, we focused on specifying rules and contextual cues to identify self-harm and determine its timing. Furthermore, our prompt design addressed the challenge of distinguishing actual self-harm events from related concepts such as suicidal ideation and self-harm risk. The prompt development followed established prompt engineering principles, eg, chain-of-thought and panel-of-experts approaches [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>], using task decomposition [<xref ref-type="bibr" rid="ref29">29</xref>] to address two sequential classification tasks: (1) binary detection of self-harm presence, and (2) temporal classification into 3 categories.</p><p>The prompt design incorporated clear inclusion criteria for completed intentional acts, comprehensive exclusion criteria (eg, thoughts, plans, and threats), and specific guidance for ambiguous cases. The final prompt required JSON-formatted output with direct textual evidence for each classification, using a 90-day threshold for determining a recent episode of self-harm. 
The prompt was validated by 2 self-harm experts before deployment.</p><p>To mitigate the risk of prompt overfitting and to meaningfully compare to a baseline RoBERTa model, the models were trained and tested on the same split partitions (<xref ref-type="fig" rid="figure2">Figure 2</xref>). For the Gemma3-27b iterative prompt refinement procedure, the training dataset was further split into development (n=542) and validation (n=542) datasets. The optimal prompt was developed iteratively on the development set (n=542), and after satisfactory performance was achieved on the validation set (n=542), the model was finally evaluated on the held-out test set (n=268).</p><p>For RoBERTa, we trained two variants: RoBERTa (n=542) using only the development set for fair comparison with Gemma3-27b&#x2019;s prompt refinement data exposure, and RoBERTa (n=1084) using the full training set. Both models were evaluated on the identical held-out test set, ensuring unbiased performance comparison. The data flow through our experimental pipeline is illustrated in <xref ref-type="fig" rid="figure1">Figure 1</xref>. The prompt engineering approach with all development details and the prompt used in this work is presented in Section C of <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-12"><title>Ethical Considerations</title><p>This study received approval from the Oxford Health NHS Foundation Trust CRIS Oversight Committee and data were processed in accordance with the procedure outlined by the Oxford Health NHS Foundation Trust [<xref ref-type="bibr" rid="ref37">37</xref>].</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Annotation Consistency</title><p>The results of inter-annotator agreement are shown in <xref ref-type="table" rid="table2">Table 2</xref>. 
These include CIs calculated using 1000 bootstrap resamples on a random set of 160 notes annotated independently by 2 human experts (GG and LB). The agreement was very good for self-harm status (presence) and was good for timing on self-harm-present notes [<xref ref-type="bibr" rid="ref38">38</xref>], indicating robust and unambiguous annotation rules and the ability of 2 independent annotators (GG and LB) to follow them easily.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Inter-annotator agreement for identifying self-harm and its timing.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Decision layer</td><td align="left" valign="bottom">Cohen &#x03BA; (95% CI)</td></tr></thead><tbody><tr><td align="left" valign="top">Self-harm status: present vs absent</td><td align="left" valign="top"><italic>&#x03BA;<sub>self-harm</sub></italic>=0.86 (0.78-0.94)</td></tr><tr><td align="left" valign="top">Timing of self-harm: recent vs historical vs unknown</td><td align="left" valign="top"><italic>&#x03BA;<sub>recency</sub></italic>=0.71 (0.55-0.83)</td></tr></tbody></table></table-wrap></sec><sec id="s3-2"><title>Multilabel Self-Harm Identification Models</title><p>Unless otherwise specified, we report <italic>F</italic><sub>1</sub>-score. Sensitivity, specificity, and recall metrics are reported explicitly. Although Gemma3-27b requires no task-specific fine-tuning, we nevertheless built 2 supervised baselines to benchmark its zero-shot prompt-based extraction. The first, RoBERTa (n=542), was trained on the same 542-note development split that guided prompt refinement, giving a like-for-like comparison in terms of labeled data &#x201C;seen&#x201D;
 by each approach. To assess the models&#x2019; performance, we computed the point estimates for precision, recall, and <italic>F</italic><sub>1</sub>-score for each of the five categories: (1) at the self-harm status level: present and absent, and (2) at the timing level: recent self-harm episode, historical episode, and unknown timing. The performance metrics with corresponding 95% CIs are presented in <xref ref-type="table" rid="table3">Table 3</xref>.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Comparisons of the three models&#x2019; performance according to self-harm status and its timing labels. Two baseline RoBERTa<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup> models were trained on datasets with 542 and 1084 samples.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Category (classification)</td><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Precision (95% CI)</td><td align="left" valign="bottom">Recall (95% CI)</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score (95% CI)</td></tr></thead><tbody><tr><td align="left" valign="top">Self-harm (absence)</td><td align="left" valign="top">RoBERTa (n=542)</td><td align="left" valign="top">0.89 (0.86-0.92)</td><td align="left" valign="top">0.93 (0.90-0.95)</td><td align="left" valign="top">0.91 (0.89-0.93)</td></tr><tr><td align="left" valign="top">Self-harm (absence)</td><td align="left" valign="top">RoBERTa (n=1084)</td><td align="left" valign="top">0.92 (0.89-0.96)</td><td align="left" valign="top">0.98 (0.96-0.99)</td><td align="left" valign="top">0.95 (0.93-0.97)</td></tr><tr><td align="left" valign="top">Self-harm (absence)</td><td align="left" valign="top">Gemma3-27b (n=542)</td><td align="left" valign="top">0.96 (0.93-0.99)</td><td align="left" valign="top">0.97 (0.95-0.99)</td><td align="left" valign="top">0.97 (0.95-0.98)</td></tr><tr><td align="left" 
valign="top">Self-harm (presence)</td><td align="left" valign="top">RoBERTa (n=542)</td><td align="left" valign="top">0.81 (0.74-0.87)</td><td align="left" valign="top">0.72 (0.65-0.79)</td><td align="left" valign="top">0.76 (0.71-0.82)</td></tr><tr><td align="left" valign="top">Self-harm (presence)</td><td align="left" valign="top">RoBERTa (n=1084)</td><td align="left" valign="top">0.94 (0.88-0.98)</td><td align="left" valign="top">0.81 (0.71-0.89)</td><td align="left" valign="top">0.87 (0.80-0.92)</td></tr><tr><td align="left" valign="top">Self-harm (presence)</td><td align="left" valign="top">Gemma3-27b (n=542)</td><td align="left" valign="top">0.92 (0.86-0.98)</td><td align="left" valign="top">0.91 (0.86-0.95)</td><td align="left" valign="top">0.92 (0.88-0.96)</td></tr><tr><td align="left" valign="top">Timing (historical)</td><td align="left" valign="top">RoBERTa (n=542)</td><td align="left" valign="top">0.87 (0.10-1)</td><td align="left" valign="top">0.07 (0-0.18)</td><td align="left" valign="top">0.12 (0-0.31)</td></tr><tr><td align="left" valign="top">Timing (historical)</td><td align="left" valign="top">RoBERTa (n=1084)</td><td align="left" valign="top">0.65 (0.10-1)</td><td align="left" valign="top">0.11 (0-0.29)</td><td align="left" valign="top">0.18 (0-0.43)</td></tr><tr><td align="left" valign="top">Timing (historical)</td><td align="left" valign="top">Gemma3-27b (n=542)</td><td align="left" valign="top">0.47 (0.25-0.68)</td><td align="left" valign="top">0.53 (0.28-0.77)</td><td align="left" valign="top">0.51 (0.27-0.68)</td></tr><tr><td align="left" valign="top">Timing (recent)</td><td align="left" valign="top">RoBERTa (n=542)</td><td align="left" valign="top">0.52 (0.44-0.61)</td><td align="left" valign="top">0.75 (0.66-0.84)</td><td align="left" valign="top">0.62 (0.54-0.69)</td></tr><tr><td align="left" valign="top">Timing (recent)</td><td align="left" valign="top">RoBERTa (n=1084)</td><td align="left" valign="top">0.67 (0.56-0.79)</td><td 
align="left" valign="top">0.86 (0.76-0.94)</td><td align="left" valign="top">0.75 (0.65-0.84)</td></tr><tr><td align="left" valign="top">Timing (recent)</td><td align="left" valign="top">Gemma3-27b (n=542)</td><td align="left" valign="top">0.84 (0.79-0.89)</td><td align="left" valign="top">0.75 (0.59-0.90)</td><td align="left" valign="top">0.79 (0.65-0.89)</td></tr><tr><td align="left" valign="top">Timing (unknown)</td><td align="left" valign="top">RoBERTa (n=542)</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top">Timing (unknown)</td><td align="left" valign="top">RoBERTa (n=1084)</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top">Timing (unknown)</td><td align="left" valign="top">Gemma3-27b (n=542)</td><td align="left" valign="top">0.32 (0.07-0.62)</td><td align="left" valign="top">0.44 (0.11-0.68)</td><td align="left" valign="top">0.39 (0.09-0.61)</td></tr><tr><td align="left" valign="top">Average (micro)</td><td align="left" valign="top">RoBERTa (n=542)</td><td align="left" valign="top">0.80 (0.76-0.83)</td><td align="left" valign="top">0.78 (0.74-0.82)</td><td align="left" valign="top">0.79 (0.75-0.82)</td></tr><tr><td align="left" valign="top">Average (micro)</td><td align="left" valign="top">RoBERTa (n=1084)</td><td align="left" valign="top">0.85 (0.79-0.90)</td><td align="left" valign="top">0.84 (0.79-0.89)</td><td align="left" valign="top">0.83 (0.78-0.88)</td></tr><tr><td align="left" valign="top">Average (micro)</td><td align="left" valign="top">Gemma3-27b (n=542)</td><td align="left" valign="top">0.88 (0.84-0.91)</td><td align="left" valign="top">0.87 (0.84-0.91)</td><td align="left" valign="top">0.88 (0.84-0.91)</td></tr><tr><td align="left" valign="top">Average (weighted)</td><td align="left" valign="top">RoBERTa (n=542)</td><td align="left" 
valign="top">0.78 (0.72-0.82)</td><td align="left" valign="top">0.78 (0.72-0.82)</td><td align="left" valign="top">0.76 (0.72-0.80)</td></tr><tr><td align="left" valign="top">Average (weighted)</td><td align="left" valign="top">RoBERTa (n=1084)</td><td align="left" valign="top">0.85 (0.79-0.90)</td><td align="left" valign="top">0.84 (0.79-0.89)</td><td align="left" valign="top">0.83 (0.78-0.88)</td></tr><tr><td align="left" valign="top">Average (weighted)</td><td align="left" valign="top">Gemma3-27b (n=542)</td><td align="left" valign="top">0.88 (0.84-0.92)</td><td align="left" valign="top">0.87 (0.84-0.91)</td><td align="left" valign="top">0.88 (0.84-0.92)</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>RoBERTa: robustly optimized BERT pretraining approach.</p></fn></table-wrap-foot></table-wrap><p><xref ref-type="table" rid="table3">Table 3</xref> shows that RoBERTa (n=542) underperformed Gemma3-27b across all labels, most notably on recent self-harm (<italic>F</italic><sub>1</sub>=0.62 vs 0.79) and on the low-prevalence historical class, where recall collapsed to only 7% versus 53% for Gemma3-27b. The multilabel confusion matrices for all categories are shown in Figure S1 in Section D of <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>We assumed that the weak performance of RoBERTa could be attributed to limited training examples rather than intrinsic model capacity. Accordingly, we trained a second baseline RoBERTa (n=1084) on the full 1084-note training set. Performance improved, but even with nearly double the annotations RoBERTa still failed to surpass Gemma&#x2019;s precision-recall balance, demonstrating that even a relatively small (27 billion
 parameters) privacy-preserving language model offers a stronger starting point than a task-specific transformer, even when the latter is given all available data.</p><p>The Gemma3-27b model demonstrated superior performance across all classification categories, achieving a weighted <italic>F</italic><sub>1</sub>-score of 0.88 and micro <italic>F</italic><sub>1</sub>-score of 0.88, compared to RoBERTa&#x2019;s 0.83 and 0.83, respectively. This is consistent with the broader pretrained knowledge that LLMs bring to semantically complex clinical tasks, although the aggregate gains were modest (&#x2248;3&#x2010;5 weighted <italic>F</italic><sub>1</sub> points). The disparity was particularly pronounced for the more challenging temporal categories. RoBERTa failed entirely to identify &#x201C;unknown timing&#x201D; cases (<italic>F</italic><sub>1</sub>=0.0) and performed poorly on &#x201C;historical&#x201D; classifications (<italic>F</italic><sub>1</sub>=0.18), while Gemma3-27b achieved <italic>F</italic><sub>1</sub>-scores of 0.39 and 0.51 for these categories, respectively. This stark difference highlights a fundamental limitation of supervised approaches when training data are scarce. Both categories were rare in the corpus, with 77 of 1352 notes (5.7%) labeled as &#x201C;unknown&#x201D; and 90 of 1352 notes (6.7%) as &#x201C;historical.&#x201D; The performance of the 2 leading models, Gemma3-27b and RoBERTa (n=1084), was compared using the McNemar test for multilabel classifications with bootstrap analysis and the Benjamini-Hochberg false discovery rate method for multiple comparison correction. All details of the statistical comparison are shown in Section E of <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>The RoBERTa model&#x2019;s difficulty with rare categories illustrates a well-known challenge in clinical NLP: obtaining sufficient annotated examples for every category is often impractical. 
Supervised learning typically requires many examples per class to achieve reliable performance, a requirement rarely met for infrequent but clinically important categories.</p><p>Gemma3-27b&#x2019;s relatively stronger performance on these rare categories, achieved through iterative prompt engineering on a labeled development set rather than gradient-based training, suggests that the model&#x2019;s pretrained knowledge provides a useful starting point for handling the long-tail distribution typical of clinical data. However, we note that this advantage was most evident in lower-frequency timing categories; aggregate gains over RoBERTa trained on the full dataset were modest. For reproducibility and technical details of model training, please refer to Section F of <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s3-3"><title>Binary Classification for Recent Self-Harm Detection</title><p>To evaluate real-world applicability, we reformulated our multilabel task as a binary classification problem focused on identifying recent self-harm, the most clinically actionable category.</p><p>This approach mirrors practical use cases where, for example, clinicians may need to identify patients requiring intervention. This approach provides a simplified and practical categorization, aimed at identifying individuals with a recent self-harm event. 
We combined the original labels into two categories: (1) &#x201C;Recent self-harm&#x201D;&#x2014;cases with confirmed self-harm occurring within 90 days (n=252), and (2) &#x201C;Other events&#x201D;&#x2014;all remaining cases, including absent self-harm, historical events, or unknown timing (n=1100), as shown in the data flow chart in <xref ref-type="fig" rid="figure3">Figure 3</xref>.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Binary relabeling of multilabel annotated examples.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="mental_v13i1e87586_fig03.png"/></fig><p>We maintained identical train-test splits to ensure fair comparison between a baseline RoBERTa and Gemma3-27b models. The baseline RoBERTa model was retrained from scratch using binary cross-entropy loss optimized for the new labels. For Gemma3-27b, we retained the original multilabel prompt, then programmatically converted its structured output: cases labeled as both &#x201C;self-harm present&#x201D; AND &#x201C;recent&#x201D; were classified as &#x201C;Recent self-harm&#x201D;; all other label combinations mapped to &#x201C;Other events.&#x201D; The results are presented in <xref ref-type="table" rid="table4">Table 4</xref>.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Performance of two models identifying recent self-harm. 
The alternative category &#x2018;Other events&#x2019; includes any combinations, such as unconfirmed self-harm, historical events of confirmed self-harm or self-harm where timing is unknown.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Category</td><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">n cases</td><td align="left" valign="bottom">Precision (95% CI)</td><td align="left" valign="bottom">Recall (95% CI)</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub> (95% CI)</td></tr></thead><tbody><tr><td align="left" valign="top">Recent self-harm</td><td align="left" valign="top">RoBERTa<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="top">51</td><td align="left" valign="top">0.79 (0.68-0.90)</td><td align="left" valign="top">0.82 (0.71-0.92)</td><td align="left" valign="top">0.81 (0.71-0.88)</td></tr><tr><td align="left" valign="top">Recent self-harm</td><td align="left" valign="top">Gemma3-27b</td><td align="left" valign="top">51</td><td align="left" valign="top">0.77 (0.64-0.89)</td><td align="left" valign="top">0.72 (0.60-0.84)</td><td align="left" valign="top">0.74 (0.64-0.83)</td></tr><tr><td align="left" valign="top">Other events</td><td align="left" valign="top">RoBERTa</td><td align="left" valign="top">201</td><td align="left" valign="top">0.96 (0.93-0.98)</td><td align="left" valign="top">0.95 (0.92-0.98)</td><td align="left" valign="top">0.95 (0.93-0.97)</td></tr><tr><td align="left" valign="top">Other events</td><td align="left" valign="top">Gemma3-27b</td><td align="left" valign="top">201</td><td align="left" valign="top">0.94 (0.90-0.97)</td><td align="left" valign="top">0.95 (0.92-0.98)</td><td align="left" valign="top">0.94 (0.92-0.96)</td></tr><tr><td align="left" valign="top">Average (micro)</td><td align="left" valign="top">RoBERTa</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" 
rid="table4fn2">b</xref></sup></td><td align="left" valign="top">0.93 (0.89-0.96)</td><td align="left" valign="top">0.93 (0.89-0.96)</td><td align="left" valign="top">0.93 (0.89-0.96)</td></tr><tr><td align="left" valign="top">Average (micro)</td><td align="left" valign="top">Gemma3-27b</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.91 (0.87-0.94)</td><td align="left" valign="top">0.91 (0.87-0.94)</td><td align="left" valign="top">0.91 (0.87-0.94)</td></tr><tr><td align="left" valign="top">Average (weighted)</td><td align="left" valign="top">RoBERTa</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.93 (0.89-0.96)</td><td align="left" valign="top">0.93 (0.89-0.96)</td><td align="left" valign="top">0.93 (0.89-0.96)</td></tr><tr><td align="left" valign="top">Average (weighted)</td><td align="left" valign="top">Gemma3-27b</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.91 (0.87-0.94)</td><td align="left" valign="top">0.91 (0.87-0.94)</td><td align="left" valign="top">0.91 (0.87-0.94)</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>RoBERTa: robustly optimized BERT pretraining approach.</p></fn><fn id="table4fn2"><p><sup>b</sup>Not applicable.</p></fn></table-wrap-foot></table-wrap><p>The results confirmed that privacy-preserving language models can effectively identify time-sensitive clinical events. While both models demonstrated strong performance for identifying recent self-harm, the retrained RoBERTa model achieved marginally better performance with an average weighted <italic>F</italic><sub>1</sub>-score of 0.93 (95% CI 0.89-0.96) compared to Gemma3-27b&#x2019;s <italic>F</italic><sub>1</sub>-score of 0.91 (95% CI 0.87-0.94). 
For the dominant &#x201C;Other events&#x201D; category, both models performed well (<italic>F</italic><sub>1</sub>&#x2265;0.94) with almost identical performance.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>In this study, we aimed to evaluate whether privacy-preserving local language models could identify self-harm and its timing in secondary mental health records. The Gemma3-27b model, containing 27 billion parameters with a 128K context window, was quantized to 4-bit precision and deployed locally via Ollama, ensuring complete data privacy within the host health care provider&#x2019;s secure data infrastructure (in our case, the National Health Service). In a corpus of 1352 mental health clinical notes, Gemma3-27b outperformed a fine-tuned RoBERTa classifier on both detection of self-harm events and assignment of self-harm timing labels. The absolute <italic>F</italic><sub>1</sub> gain was modest for event detection (&#x2248;4%&#x2010;6%) but substantial for the challenging &#x201C;historical&#x201D; and &#x201C;unknown&#x201D; timing categories (gains of 33% and &#x2265;39%, respectively). Performance on the &#x201C;recent&#x201D; category reached an <italic>F</italic><sub>1</sub>-score of 0.79 without gradient-based fine-tuning, although this result depended on systematic prompt development using a labeled development set. The largest relative improvements over RoBERTa were observed in the rarer timing categories, while aggregate gains were modest.</p><p>On the binary task of identifying recent self-harm, RoBERTa achieved a marginally higher weighted <italic>F</italic><sub>1</sub>-score (0.93) than Gemma3-27b (0.91), although both models performed strongly and were comparable on the dominant &#x201C;Other events&#x201D; class. This indicates that supervised classifiers can be highly effective when the classification task is simplified and sufficient labeled data exist. 
The relative advantage of the prompt-based approach is most evident in the multilabel temporal setting, particularly for rarer timing categories where supervised models struggle without large per-class annotation volumes.</p></sec><sec id="s4-2"><title>Comparison With Previous Work</title><p>Ayre et al [<xref ref-type="bibr" rid="ref15">15</xref>] developed a hybrid rule-based NLP tool using spaCy to identify perinatal self-harm in EHRs from the South London and Maudsley NHS Foundation Trust, achieving micro-averaged <italic>F</italic><sub>1</sub>-scores greater than 0.8 for span, polarity, and temporality detection. However, their approach required extensive manual feature engineering, custom tokenization rules, and lexicon development. Similar to our findings, they reported temporality as the most challenging attribute (<italic>&#x03BA;</italic>=0.62) and successfully used a heuristic requiring 2 or more mentions for patient-level classification. While their rule-based system performed well, it required 13 manually curated lexicons and complex grammatical rules, highlighting the engineering burden of traditional NLP approaches. In contrast, our prompt-based Gemma3-27b achieved comparable or superior performance without task-specific feature engineering, demonstrating the efficiency gains of using modern LLMs. It is worth noting that both studies identified the same clinical challenge: the ambiguity in temporal expressions within clinical documentation, suggesting that this represents a fundamental limitation in how clinicians record self-harm events, rather than a purely technical challenge.</p></sec><sec id="s4-3"><title>Clinical and Public Health Implications</title><p>Accurate ascertainment of self-harm is crucial for improving self-harm surveillance, evaluating services, and testing new interventions designed to support people who self-harm. It is also vital for identifying individuals in need of support. 
As the Gemma3-27b model requires relatively low computing resources, it can be deployed on in-house GPUs within a health care provider&#x2019;s secure data infrastructure. This mitigates concerns about the use of &#x201C;as a service&#x201D; proprietary language models hosted outside the provider&#x2019;s own infrastructure where inference using prompting with patient-level data cannot be guaranteed to be consistent with relevant and territory-specific statutory regulations. The approach demonstrated here, namely locally developed and quantized language models deployed within a secure data environment, establishes the technical feasibility of privacy-preserving self-harm detection. Potential future applications include batch or near-real-time processing pipelines, clinical dashboards, and pseudonymized analytics. However, evaluation of operational feasibility, governance workflows, clinician-review safeguards, and scalability is beyond the scope of this study and would require dedicated implementation and prospective validation studies. While prompt-based approaches may reduce, though not eliminate, the need for large volumes of annotated training data and may facilitate adaptation to related clinical tasks such as method-specific self-harm detection, suicidal ideation, or protective factors, these extensions remain speculative and require empirical validation.</p></sec><sec id="s4-4"><title>Utility and Potential Applications of This Tool and Its Future Iterations</title><p>Self-harm is often imprecisely captured across settings, including in the United Kingdom. This tool can support efforts to improve the monitoring of self-harm within clinical populations where such information is recorded narratively. 
Reliable identification and tracking of self-harm over time can provide valuable insights into temporal trends of self-harm and help assess the impact of public health policies or broader societal events [<xref ref-type="bibr" rid="ref39">39</xref>].</p><p>Such a tool could further facilitate the identification of individuals who have recently self-harmed and may be candidates for pharmacological or psychological interventions, enabling the recruitment of representative and diverse patient samples. Moreover, given that self-harm is a key outcome in mental health care, the tool can facilitate the extraction of such information to assess the impact of targeted interventions.</p><p>Systematic and reliable identification of self-harm in EHRs is also important for estimating the burden of self-harm within clinical settings, which is essential for service planning and the allocation of resources. Similarly, it can contribute to better quality self-harm research, especially where research questions require establishing the timing of self-harm to conduct longitudinal analyses.</p></sec><sec id="s4-5"><title>Qualitative Error Analysis</title><p>To improve transparency about model limitations, we conducted a qualitative examination of Gemma3-27b misclassifications on the held-out test set, identifying 5 recurring failure modes (detailed in Section G of <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> with synthetic clinical-note examples constructed by the clinical team for governance compliance). 
These included false-positive self-harm detection, where templated risk-assessment language was mistaken for a confirmed act; false-negative detection, where self-harm described briefly within longer psychosocial narratives was overlooked; false-negative recency, where vague temporal expressions (eg, &#x201C;a few weeks ago&#x201D;) were defaulted to nonrecent despite falling within the 90-day window; false-positive recency, where present-tense clinical concern led the model to override explicit historical date markers; and false-positive unknown timing, where indirect but sufficient temporal cues (eg, age-based reasoning) were not integrated. A substantial proportion of these errors arose from genuine ambiguity in clinical documentation, contexts where even expert annotators required deliberation, rather than purely technical shortcomings. These findings highlight the importance of expert review of all model outputs prior to any operational deployment, and of continuous monitoring for potential data and model drift as documentation practices, clinical populations, or language model versions evolve.</p></sec><sec id="s4-6"><title>Strengths and Limitations</title><p>All computation occurred behind the Oxford Health NHS Foundation Trust firewall with no data egress, adhering to the relevant General Data Protection Regulation and UK Data Security &#x0026; Protection Toolkit standards. Two domain experts produced a high-quality gold standard with very good agreement (&#x03BA;=0.86) for event detection. The study used identical splits and evaluation metrics for both the Gemma3-27b language model and the RoBERTa model, isolating the effect of model architecture. A single, relatively low-cost NC-T4 node (16 GB GPU) demonstrates broad feasibility across publicly funded health care settings, such as the NHS in the United Kingdom, without requiring large-scale high-performance computing infrastructure and with a low carbon footprint.</p><p>This study has 6 main limitations. 
First, while demonstrating the feasibility of privacy-preserving language models for self-harm detection, we acknowledge limited model selection. We evaluated locally deployable models via Ollama (Llama 3.2, Mistral, Phi-4, Qwen 2, and various Gemma-3 variants), finding performance correlated with parameter count, consistent with established scaling laws [<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref41">41</xref>]. Optimal prompts varied substantially across models, reflecting differences in pretraining, corpora, and architecture [<xref ref-type="bibr" rid="ref42">42</xref>]. This model-specific sensitivity suggests Gemma3-27b may not represent optimal performance. We selected Gemma3-27b pragmatically as a representative high-performing model, balancing computational resources with demonstrating feasibility rather than identifying the optimal clinical deployment model. When using privacy-preserving, local quantized LLMs in applications similar to those in our study, it will be important to systematically evaluate different model architectures and prompting strategies.</p><p>Second, data for training and testing the models were sourced from a single region and secondary mental health care setting in England; therefore, external validity to other regions with different populations, or to primary and acute care settings, was not assessed. Of note, the nature of underlying data differs substantially between secondary mental health care (the data used in our study) and primary or acute care settings due to variations in clinical practice. For example, acute hospitals rely more heavily on structured clinical coding to record patient presentations and encounters involving self-harm, whereas mental health care data place greater emphasis on narrative psychosocial formulations of historical and current self-harm and its management. 
For these reasons, we would expect that, across the United Kingdom&#x2019;s secondary mental health care system, the presented model would show limited variation in performance for the self-harm task described here, as services share a similar culture of practice and use EHRs with comparable functionality. However, a substantially different model would likely be required to address the same task in acute or primary care EHRs.</p><p>Third, a potential selection effect arises from using a Gemma3-4b model for candidate-note identification and a Gemma3-27b model as the primary evaluation model. Although these models differ substantially in parameter count (4 billion vs 27 billion) and the screening step was used solely to make annotation feasible (it did not generate gold-standard labels, and no model parameters were updated based on screening outputs), both models belong to the same architectural family. If Gemma-family models share systematic biases in what they flag as self-harm-related, the evaluation corpus could, in principle, contain a distributional signature that favors Gemma3-27b over architecturally different models such as RoBERTa. Two observations mitigate this concern: (1) the screening step removed many &#x201C;easy negatives&#x201D; (eg, administrative notes), yielding a harder evaluation set containing more ambiguous cases, which is consistent with the study&#x2019;s clinical aims; and (2) RoBERTa achieved strong overall performance and marginally outperformed Gemma3-27b on the simplified binary task, which would not be expected under strong architectural bias. Nevertheless, a residual distributional effect cannot be excluded. 
Future validation should include sensitivity analyses, such as alternative screening strategies (eg, keyword-based or clinician-led) or a supplementary truly random annotated sample, to quantify the magnitude of any selection effect.</p><p>Fourth, because the annotated corpus is enriched for potential self-harm content, the class distribution does not reflect the prevalence that would be encountered when the model is deployed across all clinical notes in routine care. In a low-prevalence setting, even a model with high specificity can generate a nontrivial number of false positives at scale, increasing clinician review burden and potentially undermining trust. Prospective evaluation under true-prevalence conditions, prevalence-aware calibration or thresholding strategies, and clinician-in-the-loop workflows in which every model output is reviewed before any clinical action are essential prerequisites for operational deployment.</p><p>Fifth, this study did not evaluate operational deployment considerations. Questions relating to batch versus near-real-time processing architectures, governance frameworks for automated flagging, human-in-the-loop safeguards, and real-world performance under routine clinical conditions were beyond the scope of the present work and constitute essential future research.</p><p>Sixth, many patients had a long-documented history of contact with secondary mental health services. Although annotators were instructed to treat each clinical note as a standalone document, independent of any decisions made about previous extracts from the same patient, this may not have been fully achievable in practice. As a result, some annotations may have been influenced by broader impressions of the patient rather than by information explicitly present in the text being analyzed. In addition, historical (90 of 1352, 6.7%) and unknown (77 of 1352, 5.7%) timing cases were underrepresented, inflating CIs despite bootstrap resampling. 
Future work should use active-learning strategies to enrich rare labels. Finally, the 90-day threshold, while pragmatic, may not entirely align with all clinical use cases; finer-grained temporal consensus on recency remains challenging. However, the recency threshold could be readily changed by amending the prompt.</p></sec><sec id="s4-7"><title>Conclusions</title><p>This work demonstrates the technical feasibility of using a privacy-preserving, locally deployable language model within a secure NHS data infrastructure to identify self-harm and its timing. Without gradient-based fine-tuning, but with systematic prompt development on a labeled development set, Gemma3-27b matched or exceeded a fine-tuned RoBERTa classifier, with the largest gains in challenging, lower-frequency timing categories and modest aggregate improvements. On a simplified binary task, RoBERTa performed marginally better, highlighting that the choice of approach should be guided by the specific clinical task and available annotation resources. These findings establish a proof of concept; clinical deployment would require multicenter validation across geographically diverse areas and populations, prospective evaluation of operational workflows (including clinician-review safeguards, governance frameworks, and false-positive management), implementation research with stakeholders, and rigorous monitoring for model drift and unintended bias. Such studies are the critical next steps to translate privacy-preserving language models into improved self-harm surveillance and patient confidentiality.</p></sec></sec></body><back><ack><p>We would like to thank the collaborators on the National Institute for Health and Care Research-Applied Research Collaboration application, Professor Keith Hawton, Professor Andrea Cipriani, and Professor Seena Fazel, as well as Liz Bale for her contribution to annotating the data. 
We would also like to acknowledge the work and support of the Oxford Research Informatics Team: Adam Pill, Acting Joint Head of Research Informatics; Suzanne Fisher, Research Informatics Systems Analyst; Lulu Kane, Research Informatics Administrator; and Tanya Smith. The views expressed are those of the authors and not necessarily those of the UK National Health Service, the National Institute for Health and Care Research or the UK Department of Health and Social Care.</p></ack><notes><sec><title>Funding</title><p>This research was funded by the National Institute for Health and Care Research (NIHR) Applied Research Collaboration Oxford and Thames Valley, at Oxford Health NHS Foundation Trust. It was also funded by NIHR Oxford Health Biomedical Research Centre (NIHR203316) and supported by Oxford Health NHS Foundation Trust Research Informatics Team. The study was carried out using OHFT electronic patient records within the Akrivia CRIS research platform environment, owned by Akrivia Health (akriviahealth.com). GG and RB were supported by funding from Australia&#x2019;s National Health and Medical Research Council (NHMRC) awarded to RB (#GNT2008073). GG was supported in part by the Department of Health and Social Care through a grant for the Multicentre Study of Self-harm in England. RB receives salary support from the Better Health &#x0026; Care Hub at King&#x2019;s College London, UK. AK and DWJ were supported in part by the NIHR AI Award for Health and Social Care (AI_AWARD02183), AK by a research grant from GlaxoSmithKline. 
DWJ was in part supported by the Office for Life Sciences and the National Institute for Health and Care Research (NIHR) Mental Health Translational Research Collaboration Mission, hosted by the NIHR Oxford Health Biomedical Research Centre. NK is supported by the National Institute for Health and Care Research Greater Manchester Patient Safety Research Collaboration (NIHR204295), the University of Manchester and Mersey Care NHS Foundation Trust.</p></sec><sec><title>Data Availability</title><p>The data used in this work are owned by Oxford Health NHS Foundation Trust and accessed through CRIS Powered by Akrivia Health, using anonymized patient records. The data cannot be made publicly available but can be accessed with permissions from Oxford Health NHS Foundation Trust for UK NHS staff and UK academics within a secure firewall, in the same manner as the authors.</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: GG, AK, and AT</p><p>Methodology: GG and AK</p><p>Data acquisition: GG, AK, and AT</p><p>Formal analysis: GG and AK</p><p>Draft manuscript: GG and AK</p><p>Review and editing: GG, AK, AT, RB, DWJ, and NK</p><p>All authors have read and agreed to the published version of the manuscript.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">API</term><def><p>application programming interface</p></def></def-item><def-item><term id="abb2">CRIS</term><def><p>Clinical Record Interactive Search</p></def></def-item><def-item><term id="abb3">EHR</term><def><p>electronic health record</p></def></def-item><def-item><term id="abb4">GPU</term><def><p>graphics processing unit</p></def></def-item><def-item><term id="abb5"><italic>ICD-10</italic></term><def><p><italic>International Statistical Classification of Diseases, Tenth Revision</italic></p></def></def-item><def-item><term id="abb6">LLM</term><def><p>large language model</p></def></def-item><def-item><term 
id="abb7">NHS</term><def><p>National Health Service</p></def></def-item><def-item><term id="abb8">NLP</term><def><p>natural language processing</p></def></def-item><def-item><term id="abb9">RoBERTa</term><def><p>robustly optimized BERT pretraining approach</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="web"><article-title>Self-harm: assessment, management and preventing recurrence</article-title><source>National Institute for Health and Care Excellence (NICE)</source><year>2022</year><access-date>2026-03-26</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.nice.org.uk/guidance/ng225">https://www.nice.org.uk/guidance/ng225</ext-link></comment></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="web"><article-title>Suicides in England and Wales</article-title><source>Office for National Statistics</source><year>2024</year><access-date>2026-03-26</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.ons.gov.uk/peoplepopulationandcommunity/birthsdeathsandmarriages/deaths/datasets/suicidesintheunitedkingdomreferencetables">https://www.ons.gov.uk/peoplepopulationandcommunity/birthsdeathsandmarriages/deaths/datasets/suicidesintheunitedkingdomreferencetables</ext-link></comment></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tsiachristas</surname><given-names>A</given-names> </name><name name-style="western"><surname>Geulayov</surname><given-names>G</given-names> </name><name name-style="western"><surname>Casey</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Incidence and general hospital costs of self-harm across England: estimates based on the multicentre study of self-harm</article-title><source>Epidemiol Psychiatr 
Sci</source><year>2020</year><month>03</month><day>12</day><volume>29</volume><fpage>e108</fpage><pub-id pub-id-type="doi">10.1017/S2045796020000189</pub-id><pub-id pub-id-type="medline">32160934</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Geulayov</surname><given-names>G</given-names> </name><name name-style="western"><surname>Casey</surname><given-names>D</given-names> </name><name name-style="western"><surname>McDonald</surname><given-names>KC</given-names> </name><etal/></person-group><article-title>Incidence of suicide, hospital-presenting non-fatal self-harm, and community-occurring non-fatal self-harm in adolescents in England (the iceberg model of self-harm): a retrospective study</article-title><source>Lancet Psychiatry</source><year>2018</year><month>02</month><volume>5</volume><issue>2</issue><fpage>167</fpage><lpage>174</lpage><pub-id pub-id-type="doi">10.1016/S2215-0366(17)30478-9</pub-id><pub-id pub-id-type="medline">29246453</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bergen</surname><given-names>H</given-names> </name><name name-style="western"><surname>Hawton</surname><given-names>K</given-names> </name><name name-style="western"><surname>Waters</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Premature death after self-harm: a multicentre cohort study</article-title><source>Lancet</source><year>2012</year><month>11</month><day>3</day><volume>380</volume><issue>9853</issue><fpage>1568</fpage><lpage>1574</lpage><pub-id pub-id-type="doi">10.1016/S0140-6736(12)61141-6</pub-id><pub-id pub-id-type="medline">22995670</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Carroll</surname><given-names>R</given-names> </name><name name-style="western"><surname>Metcalfe</surname><given-names>C</given-names> </name><name name-style="western"><surname>Gunnell</surname><given-names>D</given-names> </name></person-group><article-title>Hospital presenting self-harm and risk of fatal and non-fatal repetition: systematic review and meta-analysis</article-title><source>PLoS ONE</source><year>2014</year><volume>9</volume><issue>2</issue><fpage>e89944</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0089944</pub-id><pub-id pub-id-type="medline">24587141</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Geulayov</surname><given-names>G</given-names> </name><name name-style="western"><surname>Casey</surname><given-names>D</given-names> </name><name name-style="western"><surname>Bale</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Suicide following presentation to hospital for non-fatal self-harm in the multicentre study of self-harm: a long-term follow-up study</article-title><source>Lancet Psychiatry</source><year>2019</year><month>12</month><volume>6</volume><issue>12</issue><fpage>1021</fpage><lpage>1030</lpage><pub-id pub-id-type="doi">10.1016/S2215-0366(19)30402-X</pub-id><pub-id pub-id-type="medline">31706930</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mars</surname><given-names>B</given-names> </name><name name-style="western"><surname>Heron</surname><given-names>J</given-names> </name><name name-style="western"><surname>Crane</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Clinical and social outcomes of adolescent self harm: population based birth cohort 
study</article-title><source>BMJ</source><year>2014</year><month>10</month><day>21</day><volume>349</volume><fpage>g5954</fpage><pub-id pub-id-type="doi">10.1136/bmj.g5954</pub-id><pub-id pub-id-type="medline">25335825</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Borschmann</surname><given-names>R</given-names> </name><name name-style="western"><surname>Becker</surname><given-names>D</given-names> </name><name name-style="western"><surname>Coffey</surname><given-names>C</given-names> </name><etal/></person-group><article-title>20-year outcomes in adolescents who self-harm: a population-based cohort study</article-title><source>Lancet Child Adolesc Health</source><year>2017</year><month>11</month><volume>1</volume><issue>3</issue><fpage>195</fpage><lpage>202</lpage><pub-id pub-id-type="doi">10.1016/S2352-4642(17)30007-X</pub-id><pub-id pub-id-type="medline">30169168</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Clements</surname><given-names>C</given-names> </name><name name-style="western"><surname>Turnbull</surname><given-names>P</given-names> </name><name name-style="western"><surname>Hawton</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Rates of self-harm presenting to general hospitals: a comparison of data from the multicentre study of self-harm in England and hospital episode statistics</article-title><source>BMJ Open</source><year>2016</year><month>02</month><day>16</day><volume>6</volume><issue>2</issue><fpage>e009749</fpage><pub-id pub-id-type="doi">10.1136/bmjopen-2015-009749</pub-id><pub-id pub-id-type="medline">26883238</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="web"><article-title>Hospital episode statistics 
(HES)</article-title><source>NHS England Digital</source><access-date>2026-03-26</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://digital.nhs.uk/data-and-information/data-tools-and-services/data-services/hospital-episode-statistics">https://digital.nhs.uk/data-and-information/data-tools-and-services/data-services/hospital-episode-statistics</ext-link></comment></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mazzali</surname><given-names>C</given-names> </name><name name-style="western"><surname>Duca</surname><given-names>P</given-names> </name></person-group><article-title>Use of administrative data in healthcare research</article-title><source>Intern Emerg Med</source><year>2015</year><month>06</month><volume>10</volume><issue>4</issue><fpage>517</fpage><lpage>524</lpage><pub-id pub-id-type="doi">10.1007/s11739-015-1213-9</pub-id><pub-id pub-id-type="medline">25711312</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schneeweiss</surname><given-names>S</given-names> </name><name name-style="western"><surname>Avorn</surname><given-names>J</given-names> </name></person-group><article-title>A review of uses of health care utilization databases for epidemiologic research on therapeutics</article-title><source>J Clin Epidemiol</source><year>2005</year><month>04</month><volume>58</volume><issue>4</issue><fpage>323</fpage><lpage>337</lpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2004.10.012</pub-id><pub-id pub-id-type="medline">15862718</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fernandes</surname><given-names>AC</given-names> </name><name 
name-style="western"><surname>Dutta</surname><given-names>R</given-names> </name><name name-style="western"><surname>Velupillai</surname><given-names>S</given-names> </name><name name-style="western"><surname>Sanyal</surname><given-names>J</given-names> </name><name name-style="western"><surname>Stewart</surname><given-names>R</given-names> </name><name name-style="western"><surname>Chandran</surname><given-names>D</given-names> </name></person-group><article-title>Identifying suicide ideation and suicidal attempts in a psychiatric clinical research database using natural language processing</article-title><source>Sci Rep</source><year>2018</year><month>05</month><day>9</day><volume>8</volume><issue>1</issue><fpage>7426</fpage><pub-id pub-id-type="doi">10.1038/s41598-018-25773-2</pub-id><pub-id pub-id-type="medline">29743531</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ayre</surname><given-names>K</given-names> </name><name name-style="western"><surname>Bittar</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kam</surname><given-names>J</given-names> </name><name name-style="western"><surname>Verma</surname><given-names>S</given-names> </name><name name-style="western"><surname>Howard</surname><given-names>LM</given-names> </name><name name-style="western"><surname>Dutta</surname><given-names>R</given-names> </name></person-group><article-title>Developing a natural language processing tool to identify perinatal self-harm in electronic healthcare records</article-title><source>PLoS ONE</source><year>2021</year><volume>16</volume><issue>8</issue><fpage>e0253809</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0253809</pub-id><pub-id pub-id-type="medline">34347787</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Stene-Larsen</surname><given-names>K</given-names> </name><name name-style="western"><surname>Reneflot</surname><given-names>A</given-names> </name></person-group><article-title>Contact with primary and mental health care prior to suicide: a systematic review of the literature from 2000 to 2017</article-title><source>Scand J Public Health</source><year>2019</year><month>02</month><volume>47</volume><issue>1</issue><fpage>9</fpage><lpage>17</lpage><pub-id pub-id-type="doi">10.1177/1403494817746274</pub-id><pub-id pub-id-type="medline">29207932</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fraile Navarro</surname><given-names>D</given-names> </name><name name-style="western"><surname>Ijaz</surname><given-names>K</given-names> </name><name name-style="western"><surname>Rezazadegan</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Clinical named entity recognition and relation extraction using natural language processing of medical free text: a systematic review</article-title><source>Int J Med Inform</source><year>2023</year><month>09</month><volume>177</volume><fpage>105122</fpage><pub-id pub-id-type="doi">10.1016/j.ijmedinf.2023.105122</pub-id><pub-id pub-id-type="medline">37295138</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Agrawal</surname><given-names>M</given-names> </name><name name-style="western"><surname>Hegselmann</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>Y</given-names> </name><name 
name-style="western"><surname>Sontag</surname><given-names>D</given-names> </name></person-group><article-title>Large language models are few-shot clinical information extractors</article-title><conf-name>Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing</conf-name><conf-date>Dec 7-11, 2022</conf-date><pub-id pub-id-type="doi">10.18653/v1/2022.emnlp-main.130</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Luo</surname><given-names>R</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>L</given-names> </name><name name-style="western"><surname>Xia</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>BioGPT: generative pre-trained transformer for biomedical text generation and mining</article-title><source>Brief Bioinformatics</source><year>2022</year><month>11</month><day>19</day><volume>23</volume><issue>6</issue><pub-id pub-id-type="doi">10.1093/bib/bbac409</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>K</given-names> </name><name name-style="western"><surname>Bi</surname><given-names>W</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>X</given-names> </name></person-group><article-title>Event extraction as machine reading comprehension</article-title><year>2020</year><conf-name>Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)</conf-name><conf-date>Nov 16-20, 
2020</conf-date><conf-loc>Online</conf-loc><fpage>1641</fpage><lpage>1651</lpage><pub-id pub-id-type="doi">10.18653/v1/2020.emnlp-main.128</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thapa</surname><given-names>S</given-names> </name><name name-style="western"><surname>Shiwakoti</surname><given-names>S</given-names> </name><name name-style="western"><surname>Shah</surname><given-names>SB</given-names> </name><etal/></person-group><article-title>Large language models (LLM) in computational social science: prospects, current state, and challenges</article-title><source>Soc Netw Anal Min</source><year>2025</year><volume>15</volume><issue>1</issue><pub-id pub-id-type="doi">10.1007/s13278-025-01428-9</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Dubey</surname><given-names>A</given-names> </name><name name-style="western"><surname>Jauhri</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pandey</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kadian</surname><given-names>A</given-names> </name><name name-style="western"><surname>Al-Dahle</surname><given-names>A</given-names> </name><name name-style="western"><surname>Letman</surname><given-names>A</given-names> </name><etal/></person-group><article-title>The Llama 3 herd of models</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 31, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2407.21783</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lorge</surname><given-names>I</given-names> </name><name 
name-style="western"><surname>Joyce</surname><given-names>DW</given-names> </name><name name-style="western"><surname>Taylor</surname><given-names>N</given-names> </name><name name-style="western"><surname>Nevado-Holgado</surname><given-names>A</given-names> </name><name name-style="western"><surname>Cipriani</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kormilitzin</surname><given-names>A</given-names> </name></person-group><article-title>Detecting the clinical features of difficult-to-treat depression using synthetic data from large language models</article-title><source>Comput Biol Med</source><year>2025</year><month>08</month><volume>194</volume><fpage>110246</fpage><pub-id pub-id-type="doi">10.1016/j.compbiomed.2025.110246</pub-id><pub-id pub-id-type="medline">40499374</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wiest</surname><given-names>IC</given-names> </name><name name-style="western"><surname>Ferber</surname><given-names>D</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Privacy-preserving large language models for structured medical information retrieval</article-title><source>NPJ Digit Med</source><year>2024</year><month>09</month><day>20</day><volume>7</volume><issue>1</issue><fpage>257</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01233-2</pub-id><pub-id pub-id-type="medline">39304709</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wiest</surname><given-names>IC</given-names> </name><name name-style="western"><surname>Verhees</surname><given-names>FG</given-names> </name><name name-style="western"><surname>Ferber</surname><given-names>D</given-names> 
</name><etal/></person-group><article-title>Detection of suicidality from medical text using privacy-preserving large language models</article-title><source>Br J Psychiatry</source><year>2024</year><month>12</month><volume>225</volume><issue>6</issue><fpage>532</fpage><lpage>537</lpage><pub-id pub-id-type="doi">10.1192/bjp.2024.134</pub-id><pub-id pub-id-type="medline">39497458</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hawton</surname><given-names>K</given-names> </name><name name-style="western"><surname>Bergen</surname><given-names>H</given-names> </name><name name-style="western"><surname>Casey</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Self-harm in England: a tale of three cities</article-title><source>Soc Psychiat Epidemiol</source><year>2007</year><month>07</month><volume>42</volume><issue>7</issue><fpage>513</fpage><lpage>521</lpage><pub-id pub-id-type="doi">10.1007/s00127-007-0199-7</pub-id><pub-id pub-id-type="medline">17516016</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="web"><article-title>Your strategic partner for neuroscience, dementia, neurology &#x0026; CNS -innovation</article-title><source>Akrivia Health</source><access-date>2026-03-26</access-date><comment><ext-link ext-link-type="uri" xlink:href="http://akriviahealth.com">akriviahealth.com</ext-link></comment></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Goodday</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Kormilitzin</surname><given-names>A</given-names> </name><name name-style="western"><surname>Vaci</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Maximizing the use of social and 
behavioural information from secondary care mental health electronic health records</article-title><source>J Biomed Inform</source><year>2020</year><month>07</month><volume>107</volume><fpage>103429</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2020.103429</pub-id><pub-id pub-id-type="medline">32387393</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wei</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Schuurmans</surname><given-names>D</given-names> </name><name name-style="western"><surname>Bosma</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ichter</surname><given-names>B</given-names> </name><name name-style="western"><surname>Xia</surname><given-names>F</given-names> </name><etal/></person-group><article-title>Chain-of-thought prompting elicits reasoning in large language models</article-title><year>2022</year><conf-name>36th Conference on Neural Information Processing Systems (NeurIPS 2022)</conf-name><conf-date>Nov 28 to Dec 9, 2022</conf-date><pub-id pub-id-type="doi">10.52202/068431-1800</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>KCC</given-names> </name></person-group><article-title>Towards reasoning in large language models: a survey</article-title><year>2023</year><conf-name>Findings of the Association for Computational Linguistics</conf-name><conf-date>Jul 9-14, 2023</conf-date><pub-id pub-id-type="doi">10.18653/v1/2023.findings-acl.67</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Meng</surname><given-names>X</given-names> </name><name name-style="western"><surname>Yan</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>K</given-names> </name><etal/></person-group><article-title>The application of large language models in medicine: a scoping review</article-title><source>iScience</source><year>2024</year><month>05</month><day>17</day><volume>27</volume><issue>5</issue><fpage>109713</fpage><pub-id pub-id-type="doi">10.1016/j.isci.2024.109713</pub-id><pub-id pub-id-type="medline">38746668</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="other"><person-group person-group-type="author"><collab>Gemma Team</collab><name name-style="western"><surname>Kamath</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ferret</surname><given-names>J</given-names> </name><name name-style="western"><surname>Pathak</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Gemma 3 technical report</article-title><source>arXiv</source><comment>Preprint posted online on Mar 25, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2503.19786</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="web"><article-title>Ollama (version 0.11.11)</article-title><source>Ollama</source><access-date>2026-04-02</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://ollama.com/">https://ollama.com/</ext-link></comment></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Paszke</surname><given-names>A</given-names> </name><name name-style="western"><surname>Gross</surname><given-names>S</given-names> </name><name name-style="western"><surname>Massa</surname><given-names>F</given-names> </name><name name-style="western"><surname>Lerer</surname><given-names>A</given-names> 
</name><name name-style="western"><surname>Bradbury</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chanan</surname><given-names>G</given-names> </name><etal/></person-group><article-title>PyTorch: an imperative style, high-performance deep learning library</article-title><source>arXiv</source><comment>Preprint posted online on Dec 3, 2019</comment><pub-id pub-id-type="doi">10.48550/arXiv.1912.01703</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Reynolds</surname><given-names>L</given-names> </name><name name-style="western"><surname>McDonell</surname><given-names>K</given-names> </name></person-group><article-title>Prompt programming for large language models: beyond the few-shot paradigm</article-title><year>2021</year><month>05</month><day>8</day><conf-name>CHI &#x2019;21</conf-name><conf-loc>Yokohama, Japan</conf-loc><fpage>1</fpage><lpage>7</lpage><pub-id pub-id-type="doi">10.1145/3411763.3451760</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>P</given-names> </name><name name-style="western"><surname>Yuan</surname><given-names>W</given-names> </name><name name-style="western"><surname>Fu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Hayashi</surname><given-names>H</given-names> </name><name name-style="western"><surname>Neubig</surname><given-names>G</given-names> </name></person-group><article-title>Pre-train, prompt, and predict: a systematic survey of prompting methods in natural language processing</article-title><source>ACM Comput 
Surv</source><year>2023</year><month>09</month><day>30</day><volume>55</volume><issue>9</issue><fpage>1</fpage><lpage>35</lpage><pub-id pub-id-type="doi">10.1145/3560815</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="web"><article-title>CRIS privacy notice</article-title><source>Oxford Health NHS Foundation Trust</source><access-date>2026-03-26</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.oxfordhealth.nhs.uk/research/toolkit/cris/cris-privacy-notice">https://www.oxfordhealth.nhs.uk/research/toolkit/cris/cris-privacy-notice</ext-link></comment></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Altman</surname><given-names>DG</given-names> </name></person-group><source>Practical Statistics for Medical Research</source><year>1990</year><publisher-name>Chapman and Hall/CRC</publisher-name><pub-id pub-id-type="doi">10.1201/9780429258589</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Moran</surname><given-names>P</given-names> </name><name name-style="western"><surname>Chandler</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dudgeon</surname><given-names>P</given-names> </name><etal/></person-group><article-title>The Lancet Commission on self-harm</article-title><source>Lancet</source><year>2024</year><month>10</month><day>12</day><volume>404</volume><issue>10461</issue><fpage>1445</fpage><lpage>1492</lpage><pub-id pub-id-type="doi">10.1016/S0140-6736(24)01121-8</pub-id><pub-id pub-id-type="medline">39395434</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Kaplan</surname><given-names>J</given-names> 
</name><name name-style="western"><surname>McCandlish</surname><given-names>S</given-names> </name><name name-style="western"><surname>Henighan</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Scaling laws for neural language models</article-title><source>arXiv</source><comment>Preprint posted online on Jan 23, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.2001.08361</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Hoffmann</surname><given-names>J</given-names> </name><name name-style="western"><surname>Borgeaud</surname><given-names>S</given-names> </name><name name-style="western"><surname>Mensch</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Training compute-optimal large language models</article-title><source>arXiv</source><comment>Preprint posted online on Mar 29, 2022</comment><pub-id pub-id-type="doi">10.48550/arXiv.2203.15556</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Lester</surname><given-names>B</given-names> </name><name name-style="western"><surname>Al-Rfou</surname><given-names>R</given-names> </name><name name-style="western"><surname>Constant</surname><given-names>N</given-names> </name></person-group><article-title>The power of scale for parameter-efficient prompt tuning</article-title><conf-name>Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing</conf-name><conf-date>Nov 7-11, 2021</conf-date><conf-loc>Online and Punta Cana, Dominican Republic</conf-loc><fpage>3045</fpage><lpage>3059</lpage><pub-id pub-id-type="doi">10.18653/v1/2021.emnlp-main.243</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Additional tables, figures, and information.</p><media xlink:href="mental_v13i1e87586_app1.docx" xlink:title="DOCX File, 282 KB"/></supplementary-material></app-group></back></article>