<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Ment Health</journal-id><journal-id journal-id-type="publisher-id">mental</journal-id><journal-id journal-id-type="index">16</journal-id><journal-title>JMIR Mental Health</journal-title><abbrev-journal-title>JMIR Ment Health</abbrev-journal-title><issn pub-type="epub">2368-7959</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v11i1e53778</article-id><article-id pub-id-type="doi">10.2196/53778</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Generation of Backward-Looking Complex Reflections for a Motivational Interviewing&#x2013;Based Smoking Cessation Chatbot Using GPT-4: Algorithm Development and Validation</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Kumar</surname><given-names>Ash Tanuj</given-names></name><degrees>BSc, MASc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wang</surname><given-names>Cindy</given-names></name><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Dong</surname><given-names>Alec</given-names></name><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Rose</surname><given-names>Jonathan</given-names></name><degrees>PhD</degrees><xref 
ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib></contrib-group><aff id="aff1"><institution>Faculty of Applied Science &#x0026; Engineering, University of Toronto</institution>, <addr-line>Toronto</addr-line><addr-line>ON</addr-line>, <country>Canada</country></aff><aff id="aff2"><institution>The Edward S Rogers Sr Department of Electrical and Computer Engineering, University of Toronto</institution>, <addr-line>10 King's College Road</addr-line><addr-line>Toronto</addr-line><addr-line>ON</addr-line>, <country>Canada</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Torous</surname><given-names>John</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Andrade</surname><given-names>Jackie</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Heffner</surname><given-names>Jaimee</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Siddals</surname><given-names>Steven</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Jonathan Rose, PhD, The Edward S Rogers Sr Department of Electrical and Computer Engineering, University of Toronto, 10 King's College Road, Toronto, ON, M5S 3G4, Canada, +1 416-978-6992; <email>Jonathan.Rose@utoronto.ca</email></corresp></author-notes><pub-date pub-type="collection"><year>2024</year></pub-date><pub-date pub-type="epub"><day>26</day><month>9</month><year>2024</year></pub-date><volume>11</volume><elocation-id>e53778</elocation-id><history><date date-type="received"><day>18</day><month>10</month><year>2023</year></date><date date-type="rev-recd"><day>29</day><month>03</month><year>2024</year></date><date date-type="accepted"><day>15</day><month>04</month><year>2024</year></date></history><copyright-statement>&#x00A9; Ash Tanuj Kumar, Cindy Wang, Alec Dong, 
Jonathan Rose. Originally published in JMIR Mental Health (<ext-link ext-link-type="uri" xlink:href="https://mental.jmir.org">https://mental.jmir.org</ext-link>), 26.9.2024. </copyright-statement><copyright-year>2024</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Mental Health, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://mental.jmir.org/">https://mental.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://mental.jmir.org/2024/1/e53778"/><abstract><sec><title>Background</title><p>Motivational interviewing (MI) is a therapeutic technique that has been successful in helping smokers reduce smoking but has limited accessibility due to the high cost and low availability of clinicians. To address this, the MIBot project has sought to develop a chatbot that emulates an MI session with a client with the specific goal of moving an ambivalent smoker toward the direction of quitting. One key element of an MI conversation is reflective listening, where a therapist expresses their understanding of what the client has said by uttering a <italic>reflection</italic> that encourages the client to continue their thought process. <italic>Complex</italic> reflections link the client&#x2019;s responses to relevant ideas and facts to enhance this contemplation. 
Backward-looking complex reflections (BLCRs) link the client&#x2019;s most recent response to a relevant selection of the client&#x2019;s previous statements. Our current chatbot can generate complex reflections<bold>&#x2014;</bold>but not BLCRs&#x2014;using large language models (LLMs) such as GPT-2, which allows the generation of unique, human-like messages customized to client responses. Recent advancements in these models, such as the introduction of GPT-4, provide a novel way to generate complex text by feeding the models instructions and conversational history directly, making this a promising approach to generate BLCRs.</p></sec><sec><title>Objective</title><p>This study aims to develop a method to generate BLCRs for an MI-based smoking cessation chatbot and to measure the method&#x2019;s effectiveness.</p></sec><sec sec-type="methods"><title>Methods</title><p>LLMs such as GPT-4 can be stimulated to produce specific types of responses to their inputs by &#x201C;asking&#x201D; them with an English-based description of the desired output. These descriptions are called <italic>prompts</italic>, and the goal of writing a description that causes an LLM to generate the required output is termed <italic>prompt engineering</italic>. We evolved an instruction to prompt GPT-4 to generate a BLCR, given the portions of the transcript of the conversation up to the point where the reflection was needed. The approach was tested on 50 previously collected MIBot transcripts of conversations with smokers and was used to generate a total of 150 reflections. The quality of the reflections was rated on a 4-point scale by 3 independent raters to determine whether they met specific criteria for acceptability.</p></sec><sec sec-type="results"><title>Results</title><p>Of the 150 generated reflections, 132 (88%) met the level of acceptability. The remaining 18 (12%) had one or more flaws that made them inappropriate as BLCRs. 
The 3 raters had pairwise agreement on 80% to 88% of these scores.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The method presented to generate BLCRs is good enough to be used as one source of reflections in an MI-style conversation but would need an automatic checker to eliminate the unacceptable ones. This work illustrates the power of the new LLMs to generate therapeutic client-specific responses under the command of a language-based specification.</p></sec></abstract><kwd-group><kwd>motivational interviewing</kwd><kwd>smoking cessation</kwd><kwd>therapy</kwd><kwd>automated therapy</kwd><kwd>natural language processing</kwd><kwd>large language models</kwd><kwd>GPT-4</kwd><kwd>chatbot</kwd><kwd>dialogue agent</kwd><kwd>reflections</kwd><kwd>reflection generation</kwd><kwd>smoking</kwd><kwd>cessation</kwd><kwd>ChatGPT</kwd><kwd>smokers</kwd><kwd>smoker</kwd><kwd>effectiveness</kwd><kwd>messages</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>Smoking cessation therapists have long used the motivational interviewing (MI) talk therapy to guide clients toward positive behavioral change [<xref ref-type="bibr" rid="ref1">1</xref>]. MI engages clients in a structured conversation that encourages them to contemplate their behavior more deeply and motivates them to change it. MI has been shown to be successful in helping clients reduce or quit their smoking habits [<xref ref-type="bibr" rid="ref2">2</xref>], but the availability of MI-trained clinicians is limited to hospitals and medical centers, and MI therapy is usually only initiated after a smoking-related health issue occurs [<xref ref-type="bibr" rid="ref3">3</xref>]. 
These restrictions make it difficult for smokers to access therapy outside of medical centers and occur too late to have a preventative effect.</p><p>Our research seeks to automate the therapist side of an MI conversation which, if successful, could broaden access to care at a population level. We have been developing a chatbot, called MIBot [<xref ref-type="bibr" rid="ref4">4</xref>], whose purpose is to move ambivalent smokers toward the direction of quitting. MIBot is being developed by an interdisciplinary research collaboration among expert MI-trained clinicians, social scientists, and computer engineers. The initial version of the MIBot chatbot guides the client through a fairly simple MI conversation by combining scripted interactions with context-specific responses generated by natural language models, based on elements of the MI approach.</p><p>The focus of the initial version of the MIBot chatbot is on one core skill of MI: reflective listening [<xref ref-type="bibr" rid="ref1">1</xref>], in which the chatbot provides reflections on what the client has most recently said. In general, reflections are meant to express the therapist&#x2019;s current understanding of the client&#x2019;s most recent response and invite the client to continue further contemplation of their behavior. Reflections can be simple or complex [<xref ref-type="bibr" rid="ref1">1</xref>]. A simple reflection rephrases a client&#x2019;s response, sending the message that the response was understood and inviting the client to continue. A complex reflection attempts to infer relevant information about the client from the client&#x2019;s utterance by linking the client&#x2019;s response to relevant facts or ideas. 
A good quality complex reflection may further infer something about the emotional state of the client through their utterance.</p><p>In a complex reflection, when these relevant facts come from a client&#x2019;s earlier responses in the conversation, we call this a backward-looking complex reflection (BLCR). Preferably, a BLCR does not simply summarize all the past conversational information in order but is composed of the information that is sensible for the context. <xref ref-type="other" rid="box1">Textbox 1</xref> shows an example of a conversation in which the final statement by the therapist is a BLCR.</p><boxed-text id="box1"><title> Example motivational interviewing conversation in which the last utterance by the therapist is a backward-looking complex reflection.</title><p>Therapist: What is one thing you like about smoking?</p><p>Client: It makes me have less stress and keeps me connected to my friends.</p><p>Therapist: What is one thing you dislike about smoking?</p><p>Client: It leaves bad breath.</p><p>Therapist: What is one thing about your smoking addiction that you would like to change?</p><p>Client: I would like to reduce smoking.</p><p>Therapist: [backward-looking complex reflection] It seems like you want to reduce your smoking, which might help your concern about bad breath</p></boxed-text><p>The initial MIBot chatbot [<xref ref-type="bibr" rid="ref4">4</xref>] only generates reflections using the client&#x2019;s most recent utterance and does not make use of prior utterances. The ability to generate BLCRs can expand the chatbot&#x2019;s options for generating context-appropriate complex reflections.</p><p>The goal of this work is to develop and evaluate a method to automatically generate BLCRs given a prior conversation. 
It has become possible to do this kind of generation through recent dramatically powerful advancements in natural language processing [<xref ref-type="bibr" rid="ref5">5</xref>], and more specifically the most recent large language models (LLMs) from GPT-3.5 and later [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref8">8</xref>].</p><p>LLMs are language models which take text as input and generate textual output. GPT-4, an LLM introduced in March 2023, has significantly improved capability to generate text to satisfy particular requirements compared to previous LLMs [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref9">9</xref>]. One way to use GPT-4 is to write a <italic>prompt</italic>, which is a language-based instruction that literally tells the model the processing that is desired [<xref ref-type="bibr" rid="ref9">9</xref>]. This processing is potentially anything that can be described in language, which is a truly remarkable, new capability that will have many applications. We describe a method for developing the prompts needed to &#x201C;tell&#x201D; the model to create BLCRs.</p><p>This paper is organized as follows: the <italic>Prior Work</italic> section introduces MI, GPT-4, and the relevant parts of the MIBot project that we build on. The <italic>Methods</italic> section describes the prompt developed to generate a BLCR, the specific structure of the input to GPT-4, the rating scale developed to assess when a BLCR is acceptable, the experimental procedure to test the acceptability of BLCRs generated by the prompt, and the data used to test this procedure. The <italic>Results</italic> section provides the evaluation, and the <italic>Discussion</italic> section interprets the results of the experiment and lists limitations. 
The <italic>Conclusions</italic> section suggests avenues for further work.</p></sec><sec id="s1-2"><title>Prior Work</title><sec id="s1-2-1"><title>Motivational Interviewing</title><p>MI is a therapeutic technique in which a therapist engages in a conversation to guide and motivate clients who are ambivalent about their behaviors to move toward changing them [<xref ref-type="bibr" rid="ref1">1</xref>]. These guided conversations use 4 MI core skills: asking open-ended questions, providing reflections, affirmations, and summarization. In an MI conversation, the therapist will typically begin with an open-ended question, listen to the client&#x2019;s response, and reply with 1 of the other 3 core skill types, depending on the circumstances and the direction the therapist wishes to guide the conversation.</p><p>While all 4 core skill types are integral to a successful MI, we focus on the role of reflections and the related reflective listening. Reflective listening requires the therapist to listen to what the client has most recently said and formulate a response<italic>&#x2014;</italic>called a reflection<italic>&#x2014;</italic>that displays the therapist&#x2019;s understanding while also guiding the conversation. The content of a reflection depends on the current context of the conversation. Reflections can be divided into 2 types: simple reflections and complex reflections. Simple reflections restate the client&#x2019;s response, typically using different words, so that the therapist and client can establish that they are on the same page. Complex reflections allow the therapist to link what the client has most recently said to other facts or information about the client&#x2019;s life and emotional state, usually providing some kind of inference. 
Complex reflections are used to guide the conversation toward new topics.</p><p>MI has been shown to be a successful therapy for moving clients toward reducing their smoking habits [<xref ref-type="bibr" rid="ref2">2</xref>], and reflections in particular have been correlated with high perceived support for patient autonomy in MI sessions [<xref ref-type="bibr" rid="ref10">10</xref>].</p></sec><sec id="s1-2-2"><title>LLMs and GPT-4</title><p>LLMs are digital models of natural language that are able to generate text from an input by autoregressively predicting the next word in a given sequence [<xref ref-type="bibr" rid="ref6">6</xref>]. These models learn how to predict semantically and syntactically reasonable words by being trained to &#x201C;fill in the blanks&#x201D; on large amounts of diverse human-written text, which encompass questions and answers, web-based conversations, informative articles, and other kinds of digitized text. The wide range of data that LLMs are trained on has made them effective systems for generating solutions to various problems in the domain of natural language processing, such as answering questions, summarizing long text, and conversational dialogue generation [<xref ref-type="bibr" rid="ref7">7</xref>].</p><p>The GPT (Generative Pretrained Transformer) family of LLMs has proven to be state of the art in a number of general-purpose tasks [<xref ref-type="bibr" rid="ref8">8</xref>]. ChatGPT and the related GPT-4 [<xref ref-type="bibr" rid="ref9">9</xref>] model can generate human-like text and answer questions correctly to the point that it has successfully passed many professional and academic examinations [<xref ref-type="bibr" rid="ref9">9</xref>].</p><p>Due to the size of the model and the large amounts of human-produced textual data it is trained on, 1 emergent characteristic of the GPT-4 is its ability to answer questions and interpret human-readable text to follow instructions. 
This has led researchers to try and directly &#x201C;ask&#x201D; GPT-4 to generate some kind of desired text given some input. The study of ways to ask GPT-4 to generate desired text is a newly emerging field called prompt engineering. Thus, this asking process is called <italic>prompting</italic> a GPT-4 model, and these &#x201C;asks&#x201D; are typically called <italic>prompts</italic> [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref11">11</xref>].</p><p>A prompt usually consists of a <italic>request</italic> of the model to generate or process some desired text, usually followed by requirements that the generated text must satisfy or instructions that tell the model how to generate this text [<xref ref-type="bibr" rid="ref11">11</xref>]. The request can optionally be followed by an <italic>input</italic>, with the goal that the model will use the input to process and generate the requested text (<xref ref-type="table" rid="table1">Table 1</xref>).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Prompt example and generated result. The request is the first sentence, and the input is the italicized second line. The request and input message can be tested live on the OpenAI playground by copy and pasting the entire prompt text [<xref ref-type="bibr" rid="ref9">9</xref>].</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top"/><td align="left" valign="bottom">Prompt example and GPT-4&#x2013;generated result</td></tr></thead><tbody><tr><td align="left" valign="top">Prompt (request and input)</td><td align="left" valign="top">Given the Keywords below, write a paragraph that incorporates them into a story about a princess on the moon.<break/>Keywords: spaceship, cat, star, Pluto, Brazil</td></tr><tr><td align="left" valign="top">GPT-4</td><td align="left" valign="top"><italic>Once upon a time, there lived a princess, unlike any other, on the moon. 
She had a spaceship that allowed her to travel across galaxies, making frequent visits to far-off planets. It was on one such trip that she found a stray cat wandering amidst the frozen nitrogen landscapes of Pluto, looking as lost and lonely as a star flickering bleakly in an infinite sea of black. She couldn&#x2019;t bear its sight and decided to take this feline friend along on her adventures. To cheer the cat up, they took spontaneous trips to Earth, her favorite being Brazil. Their spaceship would hover quietly over the vibrant Amazon forest as the princess and her cat would gaze down at the colorful montage of life. The brilliant night sky with millions of stars reflected in the cat&#x2019;s eyes reminded the princess that, just like the starlight, even the loneliest souls could brighten up the darkest corners of the universe.</italic></td></tr></tbody></table></table-wrap><p>GPT-3&#x2013; and GPT-4&#x2013;based prompting has been shown to be highly effective in generating text to solve various natural language processing tasks [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>] and has already found applications in a diverse set of technical fields. However, a prompted GPT model does not always produce factually correct answers [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>]. In addition, a prompted GPT model is not deterministic, and a single prompt may produce different texts each time that a prompt is used to generate a completion [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. Recent research on prompt engineering has produced new methods to structure prompts for generating satisfactory texts [<xref ref-type="bibr" rid="ref13">13</xref>].</p><p>The ability to prompt is not restricted by the architecture of GPT-3 or GPT-4. 
Prompting is possible with any LLM of similar structure, and the difference in output depends on how much knowledge and prediction capability has been retained by an LLM. Thus, while our work specifically used GPT-4, this paper&#x2019;s method can be used with any LLM, including future improvements on GPT-4, and we will indicate this by referring to LLMs broadly in our methods and discussions.</p></sec><sec id="s1-2-3"><title>Existing MI Smoking Cessation Chatbots and the MIBot Project</title><p>The research and development of MI-based chatbots across several therapeutic domains remains an open problem, with numerous approaches incorporating different natural language processing techniques, and nothing yet deployed in a commercial or therapeutic context for mass adoption. For MI focusing on smoking cessation, several research teams have independently developed chatbots that have been tested and evaluated on experimental study participants. Our particular work has focused on an early step in smoking cessation, which is moving ambivalent smokers toward the decision to quit smoking.</p><p>Almusharraf et al [<xref ref-type="bibr" rid="ref14">14</xref>] designed an MI chatbot, which used predefined answers in a scripted conversation and measured its effectiveness on clients&#x2019; confidence to quit smoking with an 11-point scale. After testing this method on 97 participants, they found that the average confidence among clients to quit smoking increased by 0.8 (<italic>P</italic>&#x003C;.001 via paired 1-tailed <italic>t</italic> test) 1 week after the conversation. The scripted nature of these MI conversations, with answers not unique to clients&#x2019; responses, was suggested as a future point of improvement to investigate further.</p><p>Independently, He et al [<xref ref-type="bibr" rid="ref15">15</xref>] sought to investigate whether chatbots using MI techniques had any differing effects from neutral chatbots. 
They designed 2 chatbots<italic>&#x2014;</italic>an MI-based chatbot and a neutral, affirming chatbot<italic>&#x2014;</italic>and found that while there were no significant differences in clients&#x2019; reception of the 2 chatbots, both chatbots increased the clients&#x2019; motivations to quit smoking. The conclusions of He et al [<xref ref-type="bibr" rid="ref15">15</xref>] combined with the results of Almusharraf et al [<xref ref-type="bibr" rid="ref14">14</xref>] indicate that nonscripted responses from chatbots may be better received.</p><p>The text produced by generative models is an alternative to scripted responses, and Shen et al [<xref ref-type="bibr" rid="ref16">16</xref>] displayed how generative models could generate reflections dependent on context. Using a GPT-2&#x2013;based architecture, they created unique, context-dependent generative responses by incorporating a combination of client and therapist utterances from an existing dialogue history, and drawing from a database of previous transcripts to help select between context-relevant responses based on semantic similarity. These generated reflections were compared to a seq2seq model baseline, an older model of conditional text generation that is not LLM based, and human evaluation using a 5-point Likert scale for absolute effectiveness. The generated reflections produced by this system were considered improvements over the baseline model using standard metrics such as the Recall-Oriented Understudy for Gisting Evaluation (ROUGE) score and, in terms of absolute effectiveness, were on par with or above ground truth reference reflections. 
These results indicate that custom reflections from generative models may be effective for MI-based smoking cessation chatbots to increase users&#x2019; confidence and motivation in quitting smoking.</p><p>To explore this possibility, Brown et al [<xref ref-type="bibr" rid="ref4">4</xref>] have been iteratively developing MIBot, an MI-based smoking cessation chatbot that uses GPT-2 to generate custom reflections. They tested 3 versions of the chatbot<italic>&#x2014;</italic>labeled v5.0, v5.1, and v5.2<italic>&#x2014;</italic>on independent groups of recruited smokers to measure the effect of GPT-2&#x2013;based generative reflections on moving smokers toward changing their smoking habits. They also used a version of the chatbot that did not generate reflections<italic>&#x2014;</italic>v4.7<italic>&#x2014;</italic>for comparison. MIBot v5.0, v5.1, and v5.2 asked 5 core questions, shown in <xref ref-type="other" rid="box2">Textbox 2</xref> in sequence, expected a participant response after each question, and used a pretrained GPT-2 model to generate a custom reflection. MIBot v5.2 added extra secondary questions after questions 1 and 2 to allow participants to follow up on their initial responses to a core question, and a specific version of question 4 if the answer to question 3 was to reduce smoking. 
MIBot v4.7 also asked these questions, but responded with &#x201C;thank you&#x201D; to each response rather than generating a reflection.</p><boxed-text id="box2"><title> The 5 motivational interviewing conversational questions in the MIBot v5.2 conversation used in this paper.</title><list list-type="order"> <list-item><p>What is one thing you like about smoking?</p></list-item> <list-item><p>What is one thing you dislike about smoking?</p></list-item> <list-item><p>What is one thing about your smoking addiction that you would like to change?</p></list-item> <list-item><p>What will your life look like once you make this change?</p></list-item> <list-item><p>What is one step you need to make this change?</p></list-item> </list></boxed-text><p>The effect of MIBot versions on readiness to quit was measured using a numerical scale called the Readiness Ruler [<xref ref-type="bibr" rid="ref17">17</xref>]. Here, each participant was asked to rate their confidence, importance, and readiness to quit smoking from 0 to 10, with 10 indicating the highest value. Participants were asked to fill out the Readiness Ruler 3 times: just before, immediately after, and 1 week after the conversation with MIBot. Participants were also asked to score the perceived empathy of MIBot through the CARES (Consultation and Relational Empathy Survey) metric, a validated tool used to measure the perceived empathy of a health care interaction by asking a participant 10 statements that are each rated using a 6-point Likert scale [<xref ref-type="bibr" rid="ref18">18</xref>].</p><p>Brown et al [<xref ref-type="bibr" rid="ref4">4</xref>] found that there were statistically significant increases in participant confidence to quit smoking across all four chatbots 1 week after the conversation, with no statistically significant differences between them. 
This finding agreed with He et al&#x2019;s [<xref ref-type="bibr" rid="ref15">15</xref>] results, and Brown et al [<xref ref-type="bibr" rid="ref4">4</xref>] posited that asking questions may be enough to evoke an impact on confidence to quit. Version v5.2 did display statistically significant increases in importance and readiness to quit smoking when the other versions did not. In addition, v5.2 did exhibit a statistically significant increase in perceived empathy compared to v4.7 (<italic>P</italic>=.004) on the CARE scale. Both results were in contrast to He et al&#x2019;s [<xref ref-type="bibr" rid="ref15">15</xref>] findings that there were no statistically significant differences between neutral and MI-style chatbot conversations, and Brown et al [<xref ref-type="bibr" rid="ref4">4</xref>] postulated that this may be due to the effect of v5.2&#x2019;s LLM-based generative reflections.</p><p>MIBot v5.0, v5.1, and v5.2 generate GPT-2&#x2013;based reflections that only use a participant&#x2019;s latest response. This precludes the generation of complex reflections that can refer to earlier responses in a conversation, which are the essential element of the BLCRs that are the focus of this paper. This work builds upon Brown et al&#x2019;s [<xref ref-type="bibr" rid="ref4">4</xref>] work by creating and evaluating a method to generate BLCRs using GPT-4.</p></sec></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Overview</title><p>In this section, we describe the structure of the method used to generate BLCRs, the set of data we test our BLCR generation method on and how the resulting BLCRs are assessed.</p></sec><sec id="s2-2"><title>Ethical Considerations</title><p>Ethical standards and approval directly follow those of Brown et al [<xref ref-type="bibr" rid="ref4">4</xref>] as per the use of the data in the experiments described in that paper. 
The research used to acquire that data was approved by the University of Toronto Research Ethics Board under protocol number 35567, amended June 29, 2022, and all participants provided consent before participating in the Brown et al [<xref ref-type="bibr" rid="ref4">4</xref>] study.</p></sec><sec id="s2-3"><title>BLCR Generation Structure</title><p>In a chatbot conversation with a client, the client&#x2019;s latest and previous responses, along with the questions that were asked to evoke those responses, are packaged into a text called the <italic>client message input</italic>. A set of instructions, called the <italic>BLCR prompt</italic>, tells an LLM how to generate a BLCR from the client message input. These 2 texts are used together to generate a BLCR.</p></sec><sec id="s2-4"><title>Client Message Input</title><p>The client message input (<xref ref-type="other" rid="box3">Textbox 3</xref>) consists of (1) conversation&#x2014;the sequence of therapist questions and client responses up to the client&#x2019;s response right before the therapist&#x2019;s latest question&#x2014;and (2) latest question-response&#x2014;the therapist&#x2019;s latest question and the client&#x2019;s latest response.</p><boxed-text id="box3"><title> A sample client message input.</title><p><bold>Conversation:</bold></p><p>Therapist: What is one thing you like about smoking?</p><p>Client: It makes me to be more relaxed and releases my tension levels</p><p>Therapist: What is one thing you dislike about smoking?</p><p>Client: It would be the number of cigarettes I smoke a day plus the affordability of cigarettes theses days</p><p>Therapist: What is one thing about your smoking habit that you would like to change?</p><p>Client: The number or quantity I smoke a week</p><p>Therapist: What will your life look like when you make this change?</p><p>Client: If I can reduce by smoking 2 cigarettes a day and I would have some extra cash to do other things</p><p><bold>Latest 
Question-Response:</bold></p><p>Therapist: What are the steps you need to make this change?</p><p>Client: I need to probably set a smoking schedule that I need to stick too and also find a hobby to keep me distracted from my cravings</p><p><bold>Backward-looking complex reflection:</bold></p></boxed-text><p>The client message input is unique to each client response, and so changes on every client response. An LLM processes this input to generate a BLCR by first processing the instructions given in the BLCR prompt.</p></sec><sec id="s2-5"><title>Prompt Design</title><p>The BLCR prompt, shown in <xref ref-type="other" rid="box4">Textbox 4</xref>, consists of (1) a request to generate a BLCR meeting the standards of MI, using terms presented in the client message input (see <italic>Client Message Input</italic> section); (2) a description of a complex reflection, taken from Miller and Rollnick [<xref ref-type="bibr" rid="ref1">1</xref>]; (3) constraints and criteria to ensure the generated text meets the criteria of a complex reflection; (4) constraints and criteria to ensure the generated text meets the criteria of a BLCR; and (5) repetition of the request to generate a BLCR, given the above constraints and criteria.</p><p>The BLCR prompt is the same regardless of the client input message used. The BLCR prompt draws upon an LLM&#x2019;s implicit domain knowledge of MI [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref11">11</xref>], combined with a specific definition of a complex reflection, and constraints and criteria on what the output must follow to be an acceptable BLCR. 
For each client message input, an LLM can use the BLCR prompt&#x2019;s guidelines to generate a BLCR.</p><boxed-text id="box4"><title> The full backward-looking complex reflection prompt.</title><p>Generate a "backward-looking complex reflection" on the "Latest Question-Response" that meets the standards for Motivational Interviewing from the given "Conversation" about smoking cessation.</p><p>Refer to the following operational definition of a complex reflection in the context of Motivational Interviewing (MI):</p><p>Reflective listening statements are made by the clinician in response to client statements. A reflection may introduce new meaning or material, but it essentially captures and returns to clients something about what they have just said. Reflections are further categorized as simple or complex reflections.</p><p>Complex reflections typically add substantial meaning or emphasis to what the client has said. These reflections serve the purpose of conveying a deeper or more complex picture of what the client has said. Sometimes the clinician may choose to emphasize a particular part of what the client has said to make a point or take the conversation in a different direction. 
Clinicians may add subtle or very obvious content to the client's words, or they may combine statements from the client to form complex summaries.</p><p>A complex reflection has these hard constraints:</p><list list-type="order"><list-item><p>A complex reflection must be a statement and not a question.</p></list-item><list-item><p>A complex reflection must not give advice or information without permission, even if this advice is helpful.</p></list-item><list-item><p>A complex reflection must not direct the client by giving orders or commands.</p></list-item><list-item><p>A complex reflection must not disagree or challenge what the client has said.</p></list-item><list-item><p>A complex reflection must not incentivize people to smoke more, or discourage people from quitting smoking.</p></list-item><list-item><p>A complex reflection must not be factually wrong about smoking.</p></list-item><list-item><p>A complex reflection must be grammatically correct.</p></list-item></list><p>Here are some additional hard constraints for backward-looking complex reflections:</p><list list-type="order"><list-item><p>A backward-looking complex reflection must directly reference the Client statement and the Therapist question it is responding to in the Latest Question-Response.</p></list-item><list-item><p>A backward-looking complex reflection must include only one piece of extra information from earlier client statements in the Conversation.</p></list-item><list-item><p>A backward-looking complex reflection must not summarize the conversation.</p></list-item><list-item><p>A backward-looking complex reflection must use what the client has said in the last client statement, and the information from earlier client statements, and infer something about the client.</p></list-item></list><p>Given all the context above, generate a backward-looking complex reflection on the "Latest Question-Response" from the given "Conversation" that meets the Motivational Interviewing criteria of a 
complex reflection and satisfies all above hard constraints.</p></boxed-text><p>The BLCR prompt was created through an iterative process. Starting with an initial description was set of rules describing a BLCR and the requirements to generate a BLCR. This initial prompt was used to generate reflections on preexisting conversational data from prior conversations. These reflections were evaluated using the scale described in the <italic>Evaluation of Quality of a BLCR</italic> section. The prompt was subsequently revised to improve the responses, and the method attempted again on another set of independent conversational data. The revisions consisted of additional constraints and guidance, written in English, to address the shortcomings of the generated reflections. This iterative process continued until a prompt of sufficiently high evaluation score of the generated reflections was was achieved. The following sections describe both the data and the scale used.</p></sec><sec id="s2-6"><title>Data</title><p>To test the BLCR prompt and client message inputs on real conversational data, 50 conversations were randomly selected from the MIBot version5.1 experiment data [<xref ref-type="bibr" rid="ref4">4</xref>]. Each conversation consisted of the 5 MIBot core questions shown in (<xref ref-type="other" rid="box2">Textbox 2</xref>), along with their respective participant responses. As described in Brown et al [<xref ref-type="bibr" rid="ref4">4</xref>], the participants were 50 anonymous volunteers from the Prolific platform who self-selected based on being current smokers. All 50 participants wrote their responses in text via the MIBot text-based chat interface. <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> provides a sample conversation. 
Using the BLCR prompt and client message input, BLCRs would be generated for responses to Q3, Q4, and Q5 for each conversation, giving a total of 150 candidate BLCRs to assess.</p></sec><sec id="s2-7"><title>Evaluation of Quality of a BLCR</title><p>A rating scale was developed to numerically evaluate the quality of a BLCR. This scale allows one to determine whether a BLCR is <italic>acceptable</italic>, that is, it meets the definition of a BLCR described in the <italic>Prior Work</italic> section.</p><p>The BLCR rating scale (<xref ref-type="other" rid="box5">Textbox 5</xref>) is an ordinal scale where higher number ratings successively include and build upon lower number ratings. If a BLCR achieves a rating of 3, this means it meets the criteria of 1 (referencing a client&#x2019;s latest response), 2 (referencing previous information in the conversation), and 3 (makes an inference about the client using present and past information). Satisfying these 3 requirements meets the definition of a BLCR as defined in the <italic>Prior Work</italic> section; therefore, we call any BLCRs rated 3 or greater acceptable BLCRs. 
A further rating of 4 is included to meet the preference for a &#x201C;good&#x201D; BLCR, which does not summarize the previous contents of the conversation, an optional condition that was deemed useful for indicating an unambiguous BLCR that exceeds the minimum acceptability requirements.</p><boxed-text id="box5"><title> The backward-looking complex reflection rating scale.</title><p><bold>1: does the output reference the client&#x2019;s latest response somewhere?</bold></p><list list-type="bullet"><list-item><p>the output contains 1 or more references to the client&#x2019;s latest response</p></list-item></list><p><bold>2: 1 + does the output reference some extra information from earlier in the conversation?</bold></p><list list-type="bullet"><list-item><p>the output contains 1 or more references to 1 or more previous client responses</p></list-item></list><p><bold>3: 2 + does the output make an inference about the client using information in criteria 1 and 2?</bold></p><list list-type="bullet"><list-item><p>the output generates 1 or more novel assumptions about the client using information in 1 and 2</p></list-item></list><p><bold>4: 3 + is the output not summarizing the sequence of the conversation word for word?</bold></p><list list-type="bullet"><list-item><p>the output does not repeat the information in each client response in sequence</p></list-item></list><p/><p><bold>Criteria to accept as a backward-looking complex reflection (score a 1 [True]):</bold> it is rated 3 or greater on the above rating scale.</p><p/></boxed-text><p>A Python script was written to parse 50 conversations and build a formatted client message input for every Q3, Q4, and Q5 conversational sequence, creating 150 total inputs. These were fed to an LLM alongside the BLCR prompt, and the LLM generated 150 candidate BLCRs.</p><p>Three human raters were deployed to use the criteria of the BLCR Rating Scale to independently score all 150 generated BLCRs as acceptable or unacceptable. 
Using a binary score, an acceptable BLCR was scored 1 (true) if it received a rating of 3 or greater on the BLCR Rating Scale, while an unacceptable BLCR was scored 0 (false). The binary scoring was used to determine the <italic>acceptability</italic>: the percentage of accepted BLCRs among all generated BLCRs. The interrater reliability between the binary scores of the 3 raters was assessed using percent agreement and the calculation of Cohen <italic>&#x03BA;</italic>. This metric was chosen specifically to measure interrater reliability with an ordinal scale, and was chosen instead of a similar metric such as Fleiss <italic>&#x03BA;</italic> due to the latter&#x2019;s unsuitability in a case where all raters rate all items, which is the case for this BLCR assessment experiment [<xref ref-type="bibr" rid="ref19">19</xref>].</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Overview</title><p>This section reports the fraction of the BLCRs generated using the evaluation method described in the <italic>Methods</italic> section that were deemed acceptable by each of the 3 human raters. The first section reports the percentage of accepted BLCRs between the 3 raters and between the 3 questions, along with a breakdown of the frequency of ranking scores per question and rater. The second section reports the interrater reliability between 3 pairs of the 3 raters (rater 1 and rater 2, rater 1 and rater 3, and rater 2 and rater 3) using percent agreement, with a brief discussion on the <italic>&#x03BA;</italic> results.</p></sec><sec id="s3-2"><title>BLCR Acceptability Statistics</title><p><xref ref-type="table" rid="table2">Table 2</xref> displays the percentage of BLCRs meeting the BLCR rating criteria as acceptable (BLCR rating of 3 or greater) broken down by the rater and the question. 
<xref ref-type="table" rid="table3">Table 3</xref> displays the frequency of rating ranks broken down by question and by rater.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Percentage of backward-looking complex reflections deemed acceptable by question and rater.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" colspan="5"/></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Q3 (n=50)</td><td align="left" valign="top">Q4 (n=50)</td><td align="left" valign="top">Q5 (n=50)</td><td align="left" valign="top">Total (N=150)</td></tr></thead><tbody><tr><td align="left" valign="top">Rater 1 (%)</td><td align="left" valign="top">92</td><td align="left" valign="top">90</td><td align="left" valign="top">96</td><td align="left" valign="top">93</td></tr><tr><td align="left" valign="top">Rater 2 (%)</td><td align="left" valign="top">73</td><td align="left" valign="top">90</td><td align="left" valign="top">88</td><td align="left" valign="top">84</td></tr><tr><td align="left" valign="top">Rater 3 (%)</td><td align="left" valign="top">90</td><td align="left" valign="top">88</td><td align="left" valign="top">86</td><td align="left" valign="top">88</td></tr><tr><td align="left" valign="top">Average acceptance (%)</td><td align="left" valign="top">85 (10)</td><td align="left" valign="top">89 (1)</td><td align="left" valign="top">90 (5)</td><td align="left" valign="top">88 (5)</td></tr></tbody></table></table-wrap><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Frequency of rating by question and rater.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" colspan="2">Question and rater</td><td align="left" valign="bottom" colspan="5">Rating, n</td></tr><tr><td align="left" valign="bottom" colspan="2"/><td align="left" valign="bottom">0</td><td align="left" valign="bottom">1</td><td align="left" 
valign="bottom">2</td><td align="left" valign="bottom">3</td><td align="left" valign="bottom">4</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="7"><bold>Q3</bold></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Rater 1</td><td align="left" valign="top">4</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">3</td><td align="left" valign="top">44</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Rater 2</td><td align="left" valign="top">1</td><td align="left" valign="top">0</td><td align="left" valign="top">13</td><td align="left" valign="top">2</td><td align="left" valign="top">35</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Rater 3</td><td align="left" valign="top">2</td><td align="left" valign="top">0</td><td align="left" valign="top">3</td><td align="left" valign="top">6</td><td align="left" valign="top">40</td></tr><tr><td align="left" valign="top" colspan="7"><bold>Q4</bold></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Rater 1</td><td align="left" valign="top">1</td><td align="left" valign="top">2</td><td align="left" valign="top">2</td><td align="left" valign="top">0</td><td align="left" valign="top">46</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Rater 2</td><td align="left" valign="top">1</td><td align="left" valign="top">0</td><td align="left" valign="top">4</td><td align="left" valign="top">0</td><td align="left" valign="top">46</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Rater 3</td><td align="left" valign="top">1</td><td align="left" valign="top">0</td><td align="left" valign="top">5</td><td align="left" valign="top">0</td><td align="left" valign="top">45</td></tr><tr><td align="left" valign="top" colspan="7"><bold>Q5</bold></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Rater 1</td><td 
align="left" valign="top">1</td><td align="left" valign="top">0</td><td align="left" valign="top">1</td><td align="left" valign="top">0</td><td align="left" valign="top">49</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Rater 2</td><td align="left" valign="top">1</td><td align="left" valign="top">0</td><td align="left" valign="top">5</td><td align="left" valign="top">2</td><td align="left" valign="top">43</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Rater 3</td><td align="left" valign="top">3</td><td align="left" valign="top">2</td><td align="left" valign="top">2</td><td align="left" valign="top">0</td><td align="left" valign="top">43</td></tr><tr><td align="left" valign="top" colspan="7"><bold>Total (all questions combined)</bold></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Rater 1</td><td align="left" valign="top">6</td><td align="left" valign="top">2</td><td align="left" valign="top">3</td><td align="left" valign="top">3</td><td align="left" valign="top">139</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Rater 2</td><td align="left" valign="top">3</td><td align="left" valign="top">0</td><td align="left" valign="top">22</td><td align="left" valign="top">4</td><td align="left" valign="top">124</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Rater 3</td><td align="left" valign="top">6</td><td align="left" valign="top">2</td><td align="left" valign="top">10</td><td align="left" valign="top">6</td><td align="left" valign="top">129</td></tr></tbody></table></table-wrap><p><xref ref-type="table" rid="table2">Table 2</xref> breaks down the percent of acceptable BLCRs by rater and question, and the total column indicates the percent of BLCRs scored acceptable across all 150 responses by a single rater. 
The percentages in parentheses indicate the SD of the acceptability percentage.</p><p>The combination of high acceptability (<xref ref-type="table" rid="table2">Table 2</xref>) and high frequency of &#x201C;4&#x201D; ratings (<xref ref-type="table" rid="table3">Table 3</xref>) indicates that the majority of BLCRs generated by this method were considered &#x201C;good&#x201D; among all 3 raters. This is an indication that the LLM GPT-4 is highly capable of generating a BLCR. <xref ref-type="supplementary-material" rid="app2">Multimedia Appendices 2</xref> and <xref ref-type="supplementary-material" rid="app3">3</xref> graph the frequencies of rating by question and rater, with both indicating a large skew toward &#x201C;4&#x201D; ratings.</p></sec><sec id="s3-3"><title>Interrater Reliability</title><p>To assess the agreement of the results provided in <xref ref-type="table" rid="table1">Tables 1</xref><xref ref-type="table" rid="table2"/>-<xref ref-type="table" rid="table3">3</xref>, <xref ref-type="table" rid="table4">Table 4</xref> displays the percent agreement and Cohen <italic>&#x03BA;</italic> for each rater pair. 
All 3 raters agreed on results at least 80% of the time.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Percent agreement and Cohen <italic>&#x03BA;</italic> for rater pairs.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top"/><td align="left" valign="top">Rater 1, rater 2</td><td align="left" valign="top">Rater 1, rater 3</td><td align="left" valign="top">Rater 2, rater 3</td></tr></thead><tbody><tr><td align="left" valign="top">Agreement (%)</td><td align="left" valign="top">84</td><td align="left" valign="top">88</td><td align="left" valign="top">80</td></tr><tr><td align="left" valign="top">Cohen <italic>&#x03BA;</italic></td><td align="left" valign="top">0.26</td><td align="left" valign="top">0.36</td><td align="left" valign="top">0.16</td></tr></tbody></table></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>Altogether, the combination of high &#x201C;4&#x201D; frequency and a rating agreement of 80% and above indicates that this BLCR generation method can be expected to produce &#x201C;good&#x201D; BLCRs in the large majority of cases. In comparison, the <italic>&#x03BA;</italic> values (<xref ref-type="table" rid="table4">Table 4</xref>) indicated weak to fair agreement between all 3 pairs of raters, based on standard interpretation criteria of <italic>&#x03BA;</italic>. The discrepancy between high percentage agreement and weak to fair <italic>&#x03BA;</italic> may be due to the majority of BLCRs being rated &#x201C;4&#x201D; by all 3 raters. The lack of contrastive negative examples (very few generated BLCRs that were rated 0, 1, or 2) skews the calculation of <italic>&#x03BA;</italic> toward treating the labeling of widespread agreement as random chance. 
Therefore, percent agreement is thought to be a more realistic assessment of effectiveness in this context.</p><p><xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> contains an example of a real conversation from Brown et al [<xref ref-type="bibr" rid="ref4">4</xref>], with Brown et al&#x2019;s [<xref ref-type="bibr" rid="ref4">4</xref>] reflections (labeled MIBot [data]) and BLCRs generated by this paper&#x2019;s method (labeled MIBot [BLCR]) below those reflections. Overall, the BLCRs generated successfully iterate on Brown et al&#x2019;s [<xref ref-type="bibr" rid="ref4">4</xref>] provided reflections by better incorporating direct reflections on responses and linkages to previous responses to make inferences. A high-quality MI reflection would further infer about the emotional state of the client, and while the generated BLCRs are able to make rudimentary inferences about the mental state of the client (&#x201C;it seems that&#x2026;&#x201D;), more work may be necessary to turn these inferences into those of emotional states. The high percentage of accepted BLCRs shows promise in prompt-based methods being an effective technique for MIBot to generate complex reflections that incorporate information from the past.</p></sec><sec id="s4-2"><title>Limitations</title><p>The prompt-based BLCR generation method is restricted to MI conversations for smoking cessation and has only been tested in the context of 5-question MIBot conversations. Beyond this scope, this work may not generalize to other MI smoking cessation therapeutic contexts without changes to the prompt. However, the structure of the prompt itself is not specific to the data or the situation. The prompt can in theory be modified to remove references to smoking cessation and replace these with references to other domains, potentially offering a degree of domain generalizability across different subjects of MI therapy beyond smoking cessation. 
GPT-4 was the LLM model used in this work, but this method is applicable to any LLM model in theory. Newer LLM models, including future GPT models, may provide more robust results.</p></sec><sec id="s4-3"><title>Conclusions</title><p>This paper presented a method to use an LLM-based prompt to generate BLCRs for a version of MIBot&#x2019;s MI smoking cessation conversation. It provided a definition of a BLCR, a prompt used to generate BLCRs, and a BLCR rating scale to assess whether a BLCR is acceptable. We found that 88% (n=150) of the generated BLCRs were deemed acceptable. This paper extends the work of Brown et al [<xref ref-type="bibr" rid="ref4">4</xref>] by providing a method to generate complex reflections that incorporate information from earlier in the conversation, and uses GPT-4&#x2019;s strong text-generation capability rather than GPT-2.</p><p>Future work may build upon the definitions and methods introduced by this paper in three ways. First, the definition of a BLCR and the BLCR rating scale may be further refined to provide an accurate conceptual model of what the BLCR is trying to capture in a MI conversation. Second, the BLCR&#x2019;s prompt method can be adjusted to different MI therapeutic contexts beyond smoking cessation or refined to be more accurate for the smoking cessation context. 
Finally, the BLCR prompt method can be incorporated into MIBot, and its generated BLCRs can be assessed qualitatively and quantitatively in live experimental conversations.</p></sec></sec></body><back><ack><p>This research was funded by a Natural Sciences and Engineering Research Council of Canada Discovery grant (RGPIN-2019-04395) and the Edward S Rogers Sr Department of Electrical and Computer Engineering at the University of Toronto.</p></ack><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">BLCR</term><def><p>backward-looking complex reflection</p></def></def-item><def-item><term id="abb2">CARES</term><def><p>Consultation and Relational Empathy Survey</p></def></def-item><def-item><term id="abb3">GPT</term><def><p>Generative Pretrained Transformer</p></def></def-item><def-item><term id="abb4">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb5">MI</term><def><p>motivational interviewing</p></def></def-item><def-item><term id="abb6">ROUGE</term><def><p>Recall-Oriented Understudy for Gisting Evaluation</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Miller</surname><given-names>WR</given-names> </name><name name-style="western"><surname>Rollnick</surname><given-names>S</given-names> </name></person-group><source>Motivational Interviewing: Helping People Change</source><year>2012</year><publisher-name>Guilford Press</publisher-name><pub-id pub-id-type="other">978-1-60918-227-4</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Burke</surname><given-names>BL</given-names> </name><name 
name-style="western"><surname>Arkowitz</surname><given-names>H</given-names> </name><name name-style="western"><surname>Menchola</surname><given-names>M</given-names> </name></person-group><article-title>The efficacy of motivational interviewing: a meta-analysis of controlled clinical trials</article-title><source>J Consult Clin Psychol</source><year>2003</year><month>10</month><volume>71</volume><issue>5</issue><fpage>843</fpage><lpage>861</lpage><pub-id pub-id-type="doi">10.1037/0022-006X.71.5.843</pub-id><pub-id pub-id-type="medline">14516234</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kramer Schmidt</surname><given-names>L</given-names> </name><name name-style="western"><surname>Andersen</surname><given-names>K</given-names> </name><name name-style="western"><surname>S&#x00F8;gaard Nielsen</surname><given-names>A</given-names> </name></person-group><article-title>Differences in the delivery of motivational interviewing across three countries</article-title><source>J Ethn Subst Abuse</source><year>2022</year><month>07</month><volume>21</volume><issue>3</issue><fpage>823</fpage><lpage>844</lpage><pub-id pub-id-type="doi">10.1080/15332640.2020.1824838</pub-id><pub-id pub-id-type="medline">33032500</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Brown</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kumar</surname><given-names>AT</given-names> </name><name name-style="western"><surname>Melamed</surname><given-names>O</given-names> </name><etal/></person-group><article-title>A motivational-interviewing chatbot with generative reflections for increasing readiness to quit among smokers</article-title><source>JMIR Ment 
Health</source><year>2023</year><month>10</month><day>17</day><volume>10</volume><fpage>e49132</fpage><pub-id pub-id-type="doi">10.2196/49132</pub-id><pub-id pub-id-type="medline">37847539</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Jurafsky</surname><given-names>D</given-names> </name><name name-style="western"><surname>Martin</surname><given-names>J</given-names> </name></person-group><source>Speech and Language Processing: An Introduction to Natural Language Processing, Computational Linguistics, and Speech Recognition</source><year>2008</year><publisher-name>Prentice Hall</publisher-name></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Zhao</surname><given-names>WX</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>K</given-names> </name><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><etal/></person-group><article-title>A survey of large language models</article-title><source>arXiv</source><comment>Preprint posted online on  Nov 24, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2303.18223</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Chang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>J</given-names> </name><etal/></person-group><article-title>A survey on evaluation of large language models</article-title><source>arXiv</source><comment>Preprint posted online on  Dec 29, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2307.03109</pub-id></nlm-citation></ref><ref 
id="ref8"><label>8</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Radford</surname><given-names>A</given-names> </name><name name-style="western"><surname>Narasimhan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Salimans</surname><given-names>T</given-names> </name><name name-style="western"><surname>Sutskever</surname><given-names>I</given-names> </name></person-group><article-title>Improving language understanding by generative pre-training</article-title><source>Papers With Code</source><year>2018</year><access-date>2024-06-03</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://paperswithcode.com/paper/improving-language-understanding-by">https://paperswithcode.com/paper/improving-language-understanding-by</ext-link></comment></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><collab>OpenAI</collab><name name-style="western"><surname>Achiam</surname><given-names>J</given-names> </name><name name-style="western"><surname>Adler</surname><given-names>S</given-names> </name><etal/></person-group><article-title>GPT-4 technical report</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 4, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2303.08774</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pollak</surname><given-names>KI</given-names> </name><name name-style="western"><surname>Alexander</surname><given-names>SC</given-names> </name><name name-style="western"><surname>Tulsky</surname><given-names>JA</given-names> </name><etal/></person-group><article-title>Physician empathy and listening: associations with patient satisfaction and autonomy</article-title><source>J Am Board Fam 
Med</source><year>2011</year><volume>24</volume><issue>6</issue><fpage>665</fpage><lpage>672</lpage><pub-id pub-id-type="doi">10.3122/jabfm.2011.06.110025</pub-id><pub-id pub-id-type="medline">22086809</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>P</given-names> </name><name name-style="western"><surname>Yuan</surname><given-names>W</given-names> </name><name name-style="western"><surname>Fu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Hayashi</surname><given-names>H</given-names> </name><name name-style="western"><surname>Neubig</surname><given-names>G</given-names> </name></person-group><article-title>Pre-train, prompt, and predict: a systematic survey of prompting methods in natural language processing</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 28, 2021</comment><pub-id pub-id-type="doi">10.48550/arXiv.2107.13586</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>Brown</surname><given-names>TB</given-names> </name><name name-style="western"><surname>Mann</surname><given-names>B</given-names> </name><name name-style="western"><surname>Ryder</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Language models are few-shot learners</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 22, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.2005.14165</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="preprint"><person-group person-group-type="author"><name name-style="western"><surname>White</surname><given-names>J</given-names> 
</name><name name-style="western"><surname>Fu</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Hays</surname><given-names>S</given-names> </name><etal/></person-group><article-title>A prompt pattern catalog to enhance prompt engineering with ChatGPT</article-title><source>arXiv</source><comment>Preprint posted online on  Feb 21, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2302.11382</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Almusharraf</surname><given-names>F</given-names> </name><name name-style="western"><surname>Rose</surname><given-names>J</given-names> </name><name name-style="western"><surname>Selby</surname><given-names>P</given-names> </name></person-group><article-title>Engaging unmotivated smokers to move toward quitting: design of motivational interviewing-based chatbot through iterative interactions</article-title><source>J Med Internet Res</source><year>2020</year><month>11</month><day>3</day><volume>22</volume><issue>11</issue><fpage>e20251</fpage><pub-id pub-id-type="doi">10.2196/20251</pub-id><pub-id pub-id-type="medline">33141095</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>He</surname><given-names>L</given-names> </name><name name-style="western"><surname>Basar</surname><given-names>E</given-names> </name><name name-style="western"><surname>Wiers</surname><given-names>RW</given-names> </name><name name-style="western"><surname>Antheunis</surname><given-names>ML</given-names> </name><name name-style="western"><surname>Krahmer</surname><given-names>E</given-names> </name></person-group><article-title>Can chatbots help to motivate smoking cessation? 
A study on the effectiveness of motivational interviewing on engagement and therapeutic alliance</article-title><source>BMC Public Health</source><year>2022</year><month>04</month><day>12</day><volume>22</volume><issue>1</issue><fpage>726</fpage><pub-id pub-id-type="doi">10.1186/s12889-022-13115-x</pub-id><pub-id pub-id-type="medline">35413887</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Shen</surname><given-names>S</given-names> </name><name name-style="western"><surname>Welch</surname><given-names>C</given-names> </name><name name-style="western"><surname>Mihalcea</surname><given-names>R</given-names> </name><name name-style="western"><surname>P&#x00E9;rez-Rosas</surname><given-names>V</given-names> </name></person-group><article-title>Counseling-style reflection generation using generative pretrained transformers with augmented context</article-title><access-date>2023-08-06</access-date><conf-name>21st Annual Meeting of the Special Interest Group on Discourse and Dialogue</conf-name><conf-date>Jul 1-3, 2020</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2020.sigdial-1">https://aclanthology.org/2020.sigdial-1</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/2020.sigdial-1.2</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="web"><article-title>Readiness Ruler</article-title><source>Case Western Reserve University</source><year>2010</year><access-date>2023-08-05</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://case.edu/socialwork/centerforebp/resources/readiness-ruler">https://case.edu/socialwork/centerforebp/resources/readiness-ruler</ext-link></comment></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Mercer</surname><given-names>SW</given-names> </name><name name-style="western"><surname>Maxwell</surname><given-names>M</given-names> </name><name name-style="western"><surname>Heaney</surname><given-names>D</given-names> </name><name name-style="western"><surname>Watt</surname><given-names>GC</given-names> </name></person-group><article-title>The Consultation and Relational Empathy (CARE) measure: development and preliminary validation and reliability of an empathy-based consultation process measure</article-title><source>Fam Pract</source><year>2004</year><month>12</month><volume>21</volume><issue>6</issue><fpage>699</fpage><lpage>705</lpage><pub-id pub-id-type="doi">10.1093/fampra/cmh621</pub-id><pub-id pub-id-type="medline">15528286</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McHugh</surname><given-names>ML</given-names> </name></person-group><article-title>Interrater reliability: the kappa statistic</article-title><source>Biochem Med (Zagreb)</source><year>2012</year><volume>22</volume><issue>3</issue><fpage>276</fpage><lpage>282</lpage><pub-id pub-id-type="medline">23092060</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Example of a MIBot v5.1 conversation. Generated reflections are italicized. Reflections marked "mibot (data)" are from the original dataset. Reflections marked "mibot (blcr)" are generated from this paper's prompting method. 
Both are provided for comparison.</p><media xlink:href="mental_v11i1e53778_app1.docx" xlink:title="DOCX File, 6 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Frequency of backward-looking complex reflection rating score by question.</p><media xlink:href="mental_v11i1e53778_app2.png" xlink:title="PNG File, 20 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Frequency of backward-looking complex reflection rating score by rater.</p><media xlink:href="mental_v11i1e53778_app3.png" xlink:title="PNG File, 19 KB"/></supplementary-material></app-group></back></article>