<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">JMIR Neurotech</journal-id><journal-id journal-id-type="publisher-id">neuro</journal-id><journal-id journal-id-type="index">42</journal-id><journal-title>JMIR Neurotechnology</journal-title><abbrev-journal-title>JMIR Neurotech</abbrev-journal-title><issn pub-type="epub">2817-092X</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v5i1e69708</article-id><article-id pub-id-type="doi">10.2196/69708</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Diagnostic Accuracy of GPT-4 With Vision in Neuroradiology Board-Style Exam Questions: Cross-Sectional Case-Based Study</article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Sussan</surname><given-names>Tom T</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Brawley</surname><given-names>Rebekah R</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Eckroth</surname><given-names>Joshua</given-names></name><degrees>PhD</degrees><xref ref-type="aff" 
rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Mossell</surname><given-names>James E</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" corresp="yes" equal-contrib="yes"><name name-style="western"><surname>Weitao</surname><given-names>Tao</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib></contrib-group><aff id="aff1"><institution>Lake Erie College of Medicine</institution><addr-line>5000 Lakewood Ranch Blvd.</addr-line><addr-line>Bradenton</addr-line><addr-line>FL</addr-line><country>United States</country></aff><aff id="aff2"><institution>Department of Computer Science, Stetson University</institution><addr-line>Deland</addr-line><addr-line>FL</addr-line><country>United States</country></aff><aff id="aff3"><institution>College of Medicine, University of Central Florida</institution><addr-line>Orlando</addr-line><addr-line>FL</addr-line><country>United States</country></aff><aff id="aff4"><institution>Department of Radiology, University of Florida College of Medicine</institution><addr-line>Gainesville</addr-line><addr-line>FL</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Brini</surname><given-names>Stefano</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Liu</surname><given-names>Fuxiao</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Fitzek</surname><given-names>Sebastian</given-names></name></contrib><contrib contrib-type="reviewer"><name 
name-style="western"><surname>Zawada</surname><given-names>Stephanie</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Chen</surname><given-names>Sully</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Kancharla</surname><given-names>Venkateswara Siva Kishore</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Tao Weitao, MD, PhD, Lake Erie College of Medicine, 5000 Lakewood Ranch Blvd., Bradenton, FL, 34211, United States, 1 9417825761; <email>twei@lecom.edu</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>all authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>30</day><month>4</month><year>2026</year></pub-date><volume>5</volume><elocation-id>e69708</elocation-id><history><date date-type="received"><day>05</day><month>12</month><year>2024</year></date><date date-type="rev-recd"><day>20</day><month>01</month><year>2026</year></date><date date-type="accepted"><day>21</day><month>01</month><year>2026</year></date></history><copyright-statement>&#x00A9; Tom T Sussan, Rebekah R Brawley, Joshua Eckroth, James E Mossell, Tao Weitao. Originally published in JMIR Neurotechnology (<ext-link ext-link-type="uri" xlink:href="https://neuro.jmir.org">https://neuro.jmir.org</ext-link>), 30.4.2026. 
</copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in JMIR Neurotechnology, is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://neuro.jmir.org">https://neuro.jmir.org</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://neuro.jmir.org/2026/1/e69708"/><abstract><sec><title>Background</title><p>Multimodal artificial intelligence systems combining text and image analysis represent a paradigm shift in clinical decision support. While GPT-4 with Vision (GPT-4V) has shown promise in medical imaging interpretation, existing studies report inconsistent performance (16%&#x2010;80% accuracy) across radiological subspecialties. Critical knowledge gaps persist regarding GPT-4V&#x2019;s capability to integrate clinical history with imaging findings in complex neuroradiology scenarios, and fundamental questions remain about whether the model appropriately balances visual and textual information sources when formulating diagnoses. 
Furthermore, documented artificial intelligence hallucination rates of 35.5% to 63% in radiology applications raise urgent safety concerns, yet the relationship between modality utilization patterns and diagnostic accuracy remains unexplored.</p></sec><sec><title>Objective</title><p>This study aims to evaluate GPT-4V&#x2019;s diagnostic accuracy on expert-validated neuroradiology board-style examination questions and to examine the model&#x2019;s self-reported reliance on imaging versus clinical text data when making diagnostic decisions. A secondary objective was to examine whether self-characterized modality utilization patterns differed systematically between correct and incorrect diagnoses, potentially identifying specific failure modes requiring targeted mitigation strategies.</p></sec><sec sec-type="methods"><title>Methods</title><p>This cross-sectional study evaluated GPT-4V using 29 neuroradiology cases from the RSNA (Radiological Society of North America) Case Collection, covering adult brain and central nervous system pathologies imaged via computed tomography or magnetic resonance imaging. The cases were authored by board-certified radiologists. GPT-4V was accessed via ChatGPT Plus (July 2024) with standardized prompts selecting 1 answer from 4 options, providing diagnostic rationale, and quantifying the percentage contributions of image versus text data. Binary scoring assessed diagnostic performance (correct=1, incorrect=0). Statistical analysis included Wilson score CIs, a binomial test comparing accuracy to chance, and a 2-tailed <italic>t</italic> test comparing self-reported modality reliance between correct and incorrect diagnoses (&#x03B1;=.05, Cohen <italic>d</italic> calculated).</p></sec><sec sec-type="results"><title>Results</title><p>GPT-4V correctly diagnosed 22 of 29 cases (76% accuracy, 95% CI 57.9%-87.8%), significantly exceeding the chance performance of 25% (<italic>z</italic>=6.33; <italic>P</italic>&#x003C;<italic>.</italic>001). 
The model self-reported mean contributions of 66.1% from imaging (95% CI 63.5%&#x2010;68.8%) and 33.9% from text (95% CI 31.2%&#x2010;36.5%). Correct diagnoses (n=22) showed significantly lower self-reported image reliance (62.8%, 95% CI 61.3%&#x2010;64.3%) compared to incorrect diagnoses (n=7; 76.7%, 95% CI 73.5%&#x2010;80.0%), with a mean difference of 13.9 percentage points (95% CI 10.6&#x2010;17.3; <italic>P</italic>&#x003C;.001; Cohen <italic>d</italic>=4.08, 95% CI 2.73&#x2010;5.43). All 7 incorrect diagnoses demonstrated image-dominant attribution &#x2265;70% (Fisher exact test <italic>P</italic>&#x003C;<italic>.</italic>001), suggesting that excessive visual reliance may indicate diagnostic risk.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The 76% accuracy substantially exceeds prior GPT-4V radiology studies (43%), demonstrating that focused domain application with structured prompting enhances performance. Incorrect diagnoses are associated with higher self-reported visual reliance, suggesting a potential failure mode warranting experimental validation. This pattern identifies a potentially actionable signal for quality assurance systems. 
Clinical deployment should remain restricted to supervised educational applications with mandatory radiologist oversight until balanced context-aware integration is validated.</p></sec></abstract><kwd-group><kwd>neuroradiology</kwd><kwd>GPT-4 with Vision</kwd><kwd>GPT-4V</kwd><kwd>artificial intelligence</kwd><kwd>diagnostic accuracy</kwd><kwd>multimodal AI</kwd><kwd>medical imaging</kwd><kwd>clinical decision-making</kwd><kwd>text-image integration</kwd><kwd>board exam questions</kwd><kwd>RSNA Case Collection</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>The advent of multimodal artificial intelligence (AI) systems represents a transformative shift in medical diagnostics, particularly in radiology, where clinical decision-making requires integrated analysis of imaging findings and clinical context. Multimodal AI models combine diverse data modalities, such as imaging, text, structured records, and physiological signals, into unified analytical frameworks [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>]. Recent advancements in transformer architectures and foundation models have enabled unprecedented capabilities in processing heterogeneous medical data [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref6">6</xref>], with AI adoption in radiology accelerating rapidly in recent years [<xref ref-type="bibr" rid="ref7">7</xref>].</p><p>OpenAI&#x2019;s GPT-4 with Vision (GPT-4V), released in 2023, exemplifies this multimodal paradigm by enabling the simultaneous interpretation of text and images. 
Large language models (LLMs) have demonstrated utility in radiology report generation, board exam preparation, and clinical decision support [<xref ref-type="bibr" rid="ref8">8</xref>-<xref ref-type="bibr" rid="ref11">11</xref>], with studies reporting significant improvements in efficiency and consistency [<xref ref-type="bibr" rid="ref7">7</xref>]. However, the addition of visual integration has yielded contradictory performance patterns across radiological subspecialties, raising fundamental questions about how these models process and integrate information from different modalities.</p></sec><sec id="s1-2"><title>Current Evidence and Critical Knowledge Gaps</title><p>Empirical evaluations reveal substantial heterogeneity in GPT-4V diagnostic accuracy. Huppertz et al [<xref ref-type="bibr" rid="ref12">12</xref>] demonstrated that diagnostic accuracy improved from 8.3% with images alone to 29.1% with contextualized prompts, though the model exhibited pronounced context bias and frequent fabricated findings, with similar concerns documented in other multimodal evaluations [<xref ref-type="bibr" rid="ref13">13</xref>]. Studies report GPT-4V accuracy ranging from 16% to 49% in challenging radiology cases (characterized by rare pathologies, subtle findings, or complex differentials), consistently below trained radiologists&#x2019; performance [<xref ref-type="bibr" rid="ref14">14</xref>-<xref ref-type="bibr" rid="ref16">16</xref>]. Albaqshi et al [<xref ref-type="bibr" rid="ref17">17</xref>] found that among 6 LLMs evaluated on 56 neuroradiology cases, Claude 3.5 achieved the highest accuracy (80.4%), with LLMs performing comparably to first-year fellows while showing high consistency across repeated queries. 
Systematic reviews confirm variable results (16%&#x2010;80% accuracy) depending on case difficulty, prompt engineering, and domain specificity [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>].</p><p>Fundamental questions persist about how GPT-4V integrates visual and textual information. Multiple studies document limited visual interpretation capabilities: Schramm et al [<xref ref-type="bibr" rid="ref20">20</xref>] identified textual descriptions as the strongest contributor to performance, while Albaqshi et al [<xref ref-type="bibr" rid="ref17">17</xref>] demonstrated that image-only accuracy plummeted to 21.5% to 63.1% compared to 62.5% to 76.8% with combined inputs. Conversely, some studies show GPT-4V&#x2019;s superiority over text-only approaches [<xref ref-type="bibr" rid="ref21">21</xref>], while others report text-only models outperforming multimodal implementations [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref23">23</xref>]. These findings suggest a critical paradox: adding visual capabilities may not enhance but can potentially degrade diagnostic performance when multimodal integration is suboptimal.</p><p>A critical barrier to clinical deployment is AI hallucinations, the plausible but incorrect information that appears factually grounded [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref24">24</xref>-<xref ref-type="bibr" rid="ref26">26</xref>]. LLM hallucinations in medical contexts remain a critical concern [<xref ref-type="bibr" rid="ref7">7</xref>], manifesting as fabricated findings or misidentified modalities [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. 
Jin et al [<xref ref-type="bibr" rid="ref24">24</xref>] documented &#x201C;hidden flaws behind expert-level accuracy,&#x201D; revealing systematic errors obscured by superficially correct outputs. Current literature lacks systematic investigation of whether diagnostic failures correlate with specific modality utilization patterns. Understanding these patterns is essential for safe deployment, as systematic overreliance on either modality could lead to predictable failure modes requiring targeted mitigation.</p><p>Current multimodal foundation models exhibit limitations precluding autonomous diagnostic use: inconsistent results across identical inputs, tendency toward confabulation [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>], sensitivity to prompt engineering [<xref ref-type="bibr" rid="ref28">28</xref>], lack of transparency [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref25">25</xref>], and variable performance across modalities and anatomical regions [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref29">29</xref>]. Recent position statements emphasize that AI integration must prioritize human-AI collaboration frameworks, transparent uncertainty quantification, and mandatory expert oversight [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref30">30</xref>]. 
Despite growing literature on diagnostic accuracy, critical gaps remain regarding (1) how multimodal AI characterizes its reliance on visual versus textual inputs, (2) whether modality attribution patterns differ between correct and incorrect diagnoses, and (3) whether self-reported information utilization reflects actual processing versus post hoc rationalization [<xref ref-type="bibr" rid="ref17">17</xref>].</p></sec><sec id="s1-3"><title>Study Objectives</title><p>This study addressed two objectives: (1) to evaluate GPT-4V&#x2019;s diagnostic accuracy on expert-validated neuroradiology board-style questions, providing benchmark performance data under standardized conditions and (2) as an exploratory analysis to document GPT-4V&#x2019;s self-reported reliance on imaging versus clinical text and examine whether self-characterized modality utilization patterns differ between correct and incorrect diagnoses. We acknowledge that determining whether these self-assessments reflect actual information processing versus post hoc rationalization requires rigorous experimental validation through controlled text-only and image-only conditions.</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design and Data Source</title><p>This cross-sectional study, reported according to JARS-Quant guidelines [<xref ref-type="bibr" rid="ref31">31</xref>], evaluated GPT-4V&#x2019;s diagnostic accuracy using 29 neuroradiology cases from the RSNA (Radiological Society of North America) Case Collection. The cases included adult brain and central nervous system pathologies imaged via computed tomography (CT) or magnetic resonance imaging (MRI; Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, Figure S5.1 and Table S2.1 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>, and Table S6.1 in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>). 
Each case included a clinical vignette and diagnostic-quality imaging studies (Figures S2 in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref> and Figure S3 in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>, respectively). The inclusion criteria required expert-verified diagnoses in multiple-choice format; the cases were authored by board-certified radiologists and underwent editorial review following established quality standards for educational radiology assessments [<xref ref-type="bibr" rid="ref32">32</xref>], analogous to standardized board examination validation. Cases were excluded if they involved pediatric patients, lacked diagnostic images, or had no definitive correct answer.</p><p>While the RSNA Case Collection&#x2019;s restricted membership access reduces the likelihood of training data contamination, we acknowledge that with closed-source models, data leakage cannot be definitively ruled out and could artificially inflate performance estimates. All case materials were deidentified and used with permission. The cases were accessed in July 2024.</p></sec><sec id="s2-2"><title>Assessment and Scoring Methodology</title><p>Complete prompt structure, standardized instructions, and example responses are documented in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref> (Parts A-E), ensuring reproducibility. Binary scoring evaluated diagnostic performance: correct (score=1) if the model&#x2019;s answer matched the peer-reviewed correct diagnosis from RSNA documentation; incorrect (score=0) otherwise (Table S6.1 in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref> and Table S7.1 and Section 8.1 in <xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref>). 
All cases underwent peer review and editorial vetting by the RSNA&#x2019;s editorial board prior to publication (<xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>). No partial credit was given. Overall accuracy was calculated as percentage correct out of 29 cases. As exploratory measures, we recorded self-assessed percentage influence of image versus text for each case, emphasizing that these represent subjective self-reports rather than validated measurements of actual information contribution (Table S7.2 and Section 8.1 in <xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref>).</p></sec><sec id="s2-3"><title>Ethical Considerations</title><p>This study did not constitute human subjects research as defined by US Department of Health and Human Services regulations at 45 CFR 46.102(e) and (l) [<xref ref-type="bibr" rid="ref33">33</xref>]. The study involved secondary analysis of fully deidentified educational case materials from the RSNA Case Collection, accessed through authorized membership. No living individuals were contacted, and no identifiable private information was obtained, used, or generated. The RSNA Case Collection requires authors to remove all patient identifiers prior to submission (<xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>). All figures and multimedia appendices in this study contain only fully deidentified radiological images and clinical information from the RSNA Case Collection, with no possibility of individual identification.</p></sec><sec id="s2-4"><title>Missing Data Analysis</title><p>All 29 cases included in the analysis had complete data for the primary outcome (diagnostic accuracy; Table S5.1 in <xref ref-type="supplementary-material" rid="app7">Multimedia Appendix 7</xref>). Each case successfully elicited a diagnostic response from GPT-4V, with the model selecting 1 of 4 answer options in all instances. 
For the exploratory modality attribution measures, the model provided self-reported percentage contributions (image vs text) for all 29 cases, resulting in zero missing data for both primary and exploratory outcome measures (Figure S5.1 in <xref ref-type="supplementary-material" rid="app7">Multimedia Appendix 7</xref> and Part C in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>). Therefore, no imputation procedures or missing data analyses were necessary. The completeness of data reflects the controlled nature of the study design, where GPT-4V was systematically prompted to provide both diagnostic answers and modality attribution percentages for each case. One trial per case was conducted with standardized prompts designed to elicit complete responses. The single-trial design means that response variability across multiple trials was not assessed. GPT-4V&#x2019;s temperature setting and stochastic sampling could produce different responses on repeated trials; this variability is addressed in the <italic>Limitations</italic> section. The study protocol specified that any case with incomplete model responses would be excluded and reported as a protocol deviation. This scenario did not occur.</p></sec><sec id="s2-5"><title>Statistical Considerations</title><p>The sample size (N=29) was determined by available cases meeting the inclusion criteria (Tables S3.1 and S3.2 in <xref ref-type="supplementary-material" rid="app8">Multimedia Appendix 8</xref> document post hoc power &#x003E;99.9% for primary analysis, 97% for exploratory analysis, and &#x00B1;16.3 percentage point margin of error). Statistical significance was assessed using a 2-sided alpha level of .05 for all hypothesis tests. Statistical analysis was primarily descriptive (Tables S6.1-S6.2 in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>). 
Statistical analysis used a 2-sample 2-tailed <italic>t</italic> test with standard error&#x2013;based CIs for continuous variables; Wilson score method was applied separately for the diagnostic accuracy proportion (Table S4.2 in <xref ref-type="supplementary-material" rid="app9">Multimedia Appendix 9</xref> presents complete 2-tailed <italic>t</italic> test results: <italic>t</italic><sub>27</sub>=9.40; <italic>P&#x003C;.</italic>001, Cohen <italic>d</italic>=4.08; Tables S5.3-S5.5 in <xref ref-type="supplementary-material" rid="app7">Multimedia Appendix 7</xref> verify assumptions, including normality and equal variances, and provide comprehensive descriptive statistics). We compared the findings qualitatively to previous studies [<xref ref-type="bibr" rid="ref20">20</xref>-<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref30">30</xref>].</p><p>A 1-sample binomial test assessed whether diagnostic accuracy exceeded random guessing (25% for 4-option questions, <italic>z</italic>=6.33, <italic>P&#x003C;.</italic>001; Table S4.1 in <xref ref-type="supplementary-material" rid="app9">Multimedia Appendix 9</xref>). CIs for proportions were calculated using the Wilson score method. For modality weighting, 95% CIs were calculated using the <italic>t</italic> distribution. A 2-sample 2-tailed <italic>t</italic> test compared self-reported image reliance between correct and incorrect cases. 
Effect sizes (Cohen <italic>d</italic>) with 95% CIs are reported to allow readers to interpret clinical and statistical significance (Table S4.3 in <xref ref-type="supplementary-material" rid="app9">Multimedia Appendix 9</xref> shows all incorrect diagnoses demonstrated image-dominant attribution &#x2265;70%, Fisher exact test <italic>P&#x003C;.</italic>001; Table S5.2 in <xref ref-type="supplementary-material" rid="app7">Multimedia Appendix 7</xref> confirms no statistical outliers; Tables S7.1 and S7.2 and Section S8.2 in <xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref> document variable definitions and derived measures). Complete statistical methods are detailed in <xref ref-type="supplementary-material" rid="app9">Multimedia Appendix 9</xref>.</p><p><xref ref-type="fig" rid="figure1">Figure 1</xref> illustrates the systematic methodology for evaluating GPT-4V diagnostic performance on 29 neuroradiology cases from the RSNA Case Collection. The workflow includes (1) data sources&#x2014;cases containing CT or MRI scans of adult brain and central nervous system pathologies, clinical vignettes, and peer-reviewed multiple-choice questions; (2) standardized prompt structure&#x2014;a consistent template instructing GPT-4V to review all radiographic imaging, select 1 diagnostic answer from 4 options, provide diagnostic rationale, and quantify the percentage contribution of visual versus textual information to its diagnostic decision (exploratory outcome measure); and (3) ChatGPT Plus implementation&#x2014;prompt delivery via ChatGPT Plus web interface. This structured methodology ensures standardized evaluation while systematically capturing the model&#x2019;s self-reported reliance on visual versus textual information sources. 
One trial per case was conducted without iterative prompting to simulate real-world clinical conditions where single diagnostic assessments are typical.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Prompt engineering workflow for GPT-4 with Vision (GPT-4V) evaluation in neuroradiology board-style questions: a cross-sectional study of 29 adult brain and central nervous system pathologies from the RSNA Case Collection (July 2024). API: application programming interface; CT: computed tomography; MRI: magnetic resonance imaging.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="neuro_v5i1e69708_fig01.png"/></fig><p><xref ref-type="fig" rid="figure2">Figure 2</xref> presents a representative infectious disease case (Table S6.1 in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>, Case #14: Neurocysticercosis) from the RSNA Case Collection, 1 of 29 adult brain and central nervous system pathology cases (Figure S5.1 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>, Table S6.1 in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>) used in this July 2024 cross-sectional evaluation (Figure S5.1 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>) of GPT-4V diagnostic accuracy. 
This case features a 32-year-old male presenting with first-time seizure (complete case vignette in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>) and includes (1) a clinical vignette containing patient demographics, symptoms, and history (textual information) and (2) diagnostic-quality neuroimaging studies in PNG format (image format not documented in appendices) obtained via CT or MRI (visual information; <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref> and Figure S5.1 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). GPT-4V was required to integrate both clinical context and imaging findings (Part A: standardized prompt structure in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>) to select the correct diagnosis from 4 multiple-choice options (Part A in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref> and Answer Choices A-D in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>), with self-reported percentage contributions from each modality recorded as an exploratory measure (Parts A-B in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>; Table S6.1 in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref> confirms 65% image, 35% text attribution for this case).</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Representative infectious disease neuroradiology case from the RSNA Case Collection demonstrating clinical context integration in GPT-4 with Vision (GPT-4V) multimodal diagnostic decision-making.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="neuro_v5i1e69708_fig02.png"/></fig><p><xref ref-type="fig" rid="figure3">Figure 3</xref> presents a complex neuroradiology case (Table S6.1 in <xref ref-type="supplementary-material" 
rid="app3">Multimedia Appendix 3</xref>, Case #19: Syntelencephaly) from this July 2024 cross-sectional study (Figure S5.1 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>) evaluating GPT-4V&#x2019;s diagnostic performance on 29 adult brain and central nervous system pathology cases from the RSNA Case Collection. This case involves a 31-year-old male patient with developmental brain abnormalities and seizures (<xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>: complete case presentation with clinical vignette, developmental history). The case structure provided to GPT-4V included (1) clinical vignette describing patient demographics, symptoms (seizures), developmental history, and relevant neurological findings (textual data) and (2) complete series of diagnostic-quality neuroimaging studies obtained via CT or MRI in standardized PNG format (image format not documented); visual data (Part A in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref> and <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>). This case exemplifies challenging diagnostic scenarios where developmental malformations present subtle imaging findings requiring expert-level integration of both clinical context and radiological interpretation (Table S6.1 in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref> confirms correct diagnosis; Table S2.1 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> shows Developmental category: 100% accuracy).</p><p>The mean self-reported modality contributions across diagnostic outcomes for 29 neuroradiology cases from the RSNA Case Collection are presented. The data are shown for all cases (N=29), correct diagnoses (n=22), and incorrect diagnoses (n=7). 
Image contributions are normalized to 1.0 (blue bars) to enable comparison across categories; text contributions appear as ratios (purple bars). Error bars represent 95% CIs for text:image ratios. Incorrect diagnoses showed significantly higher self-reported image reliance (76.7%) compared to correct diagnoses (62.8%), with a mean difference of 13.9 percentage points (<italic>P&#x003C;.</italic>001; Cohen <italic>d</italic>=4.08; Tables S4.2-S4.3 in <xref ref-type="supplementary-material" rid="app9">Multimedia Appendix 9</xref>).</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Representative developmental brain malformation case highlighting complex multimodal integration requirements for accurate artificial intelligence (AI) diagnosis.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="neuro_v5i1e69708_fig03.png"/></fig></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Overview of Primary and Exploratory Findings</title><p>Our primary finding is that GPT-4V achieved 76% diagnostic accuracy (22 out of 29 correct diagnoses) on expert-validated neuroradiology cases, significantly exceeding chance performance. 
Secondary exploratory findings regarding self-reported modality utilization should be interpreted cautiously, as they represent the model&#x2019;s characterization of its process rather than validated measurements of actual information use.</p></sec><sec id="s3-2"><title>Diagnostic Performance</title><p>GPT-4V correctly diagnosed 22 out of 29 neuroradiology cases, yielding 76% accuracy (95% CI 57.9%&#x2010;87.8% by Wilson method), significantly above the 25% expected by chance (<italic>z</italic>=6.33; <italic>P&#x003C;.</italic>001; Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, Table S4.1 in <xref ref-type="supplementary-material" rid="app9">Multimedia Appendix 9</xref>, and Table S6.1 in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>). This exceeds the 43% accuracy (31/72 cases) reported by Mukherjee et al [<xref ref-type="bibr" rid="ref34">34</xref>] for GPT-4V on RSNA &#x201C;Case of the Day&#x201D; challenges, suggesting that within focused domains under structured prompting, GPT-4V&#x2019;s performance can be enhanced. The multiple-choice format (Part A in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref> and <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>) may have aided performance by providing plausible options rather than requiring open-ended diagnosis generation. However, we acknowledge that data leakage cannot be definitively ruled out with closed-source models, and any training data contamination could have contributed to this performance.</p></sec><sec id="s3-3"><title>Multimodal Data Integration Patterns</title><p>As an exploratory analysis, we examined GPT-4V&#x2019;s self-reported percentage contributions of visual versus textual information to diagnostic decisions. 
The model reported that image data contributed 66% (95% CI 63.5%&#x2010;68.8%) and textual data 34% (95% CI 31.2%&#x2010;36.5%) on average (Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> and Table S5.5 in <xref ref-type="supplementary-material" rid="app7">Multimedia Appendix 7</xref>). The overall text:image ratio was 0.53; however, this ratio varied by diagnostic outcome: correct diagnoses showed 0.60 versus incorrect diagnoses at 0.31 (<xref ref-type="fig" rid="figure4">Figure 4</xref>; Table S4.2 in <xref ref-type="supplementary-material" rid="app9">Multimedia Appendix 9</xref>), suggesting that errors may be associated with self-characterized overreliance on imaging (Table S4.3 in <xref ref-type="supplementary-material" rid="app9">Multimedia Appendix 9</xref> shows that 100% of incorrect diagnoses had image-dominant attribution &#x2265;70%, Fisher exact <italic>P&#x003C;.</italic>001; Table S6.1 in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref> provides case-level data). 
Whether this reflects actual information processing or post hoc rationalization requires rigorous experimental validation through controlled text-only and image-only conditions (Section 8.1 in <xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref> documents interpretation limitations and validation requirements).</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Self-reported image and text contributions by diagnostic outcome in GPT-4V neuroradiology evaluation.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="neuro_v5i1e69708_fig04.png"/></fig></sec><sec id="s3-4"><title>Comparative Analysis</title><p>Our 76% accuracy (95% CI 57.9%-87.8%; Table S4.1 in <xref ref-type="supplementary-material" rid="app9">Multimedia Appendix 9</xref> and Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) substantially exceeds the 43% from Mukherjee et al [<xref ref-type="bibr" rid="ref34">34</xref>], who found that GPT-4V performed worse on imaging-dependent cases (39%) than on text-only inputs (50%), suggesting heavy reliance on textual information. Our focused neuroradiology approach (Table S6.1 in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref> and Table S2.1 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>) with multiple-choice format (Part A in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref> and <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>) was associated with more robust performance, though differences in case difficulty between the RSNA Case Collection and Annual Meeting case sets may also contribute. 
Domain-specific factors and question format appear to influence success.</p><p>Our findings contrast with those of Hirosawa et al [<xref ref-type="bibr" rid="ref22">22</xref>], who reported that GPT-4V attributed only 30% of decisions to image data on general case reports, with text-only GPT-4 outperforming GPT-4V (55.9% vs 44.4%). Our substantially higher image utilization (66%; 95% CI 63.5%-68.8%; Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> and Table S5.5 in <xref ref-type="supplementary-material" rid="app7">Multimedia Appendix 7</xref>) may reflect our explicit prompting to consider and quantify image information (Part A in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref> documents modality quantification instructions) or the imaging-centric nature of neuroradiology cases compared to general medical case reports. These results suggest that GPT-4V makes extensive use of visual data, but improved outcomes depend on accurate image interpretation within the clinical context.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Summary of Main Findings</title><p>This study addressed 2 primary objectives: evaluating GPT-4V&#x2019;s diagnostic accuracy on expert-validated neuroradiology board-style questions and exploring self-reported reliance on imaging versus clinical text during diagnostic decision-making. Both objectives were successfully addressed. GPT-4V achieved 76% diagnostic accuracy on expert-validated neuroradiology cases, substantially exceeding prior performance on RSNA materials (76% vs 43% from Mukherjee et al [<xref ref-type="bibr" rid="ref34">34</xref>] on case of the day challenges), though differences in case selection and difficulty limit direct comparison. Exploratory analysis revealed an inverse relationship: incorrect diagnoses were associated with higher self-reported visual reliance compared to correct diagnoses. 
While these represent the model&#x2019;s self-characterization rather than validated measurements, this pattern generates testable hypotheses about multimodal integration failure modes. This is among the first studies systematically documenting how multimodal AI self-reports reliance on different information sources during clinical diagnosis.</p></sec><sec id="s4-2"><title>Interpretation and Comparison to Literature</title><p>Our findings contribute to emerging evidence regarding multimodal AI diagnostic accuracy and information integration patterns. Superior performance compared to prior studies [<xref ref-type="bibr" rid="ref34">34</xref>] demonstrates that diagnostic accuracy depends critically on domain specificity, question format, and prompt engineering, all of which suggest that performance cannot be characterized by single global metrics but varies substantially based on the implementation approach.</p><p>The exploratory finding regarding modality attribution, where diagnostic errors were associated with higher self-reported image reliance, aligns with multiple studies. Schramm et al [<xref ref-type="bibr" rid="ref20">20</xref>] identified textual descriptions as the strongest contributor to performance, while Hayden et al [<xref ref-type="bibr" rid="ref16">16</xref>] found that GPT-4V performed significantly worse on image-based questions (47.8%) compared to text-only questions (81.5%). Albaqshi et al [<xref ref-type="bibr" rid="ref17">17</xref>] demonstrated that image-only accuracy plummeted compared to text-with-image performance. Our pattern may reflect a failure mode in which the model attempts to extract diagnostic information primarily from visual data despite limited visual interpretation capabilities, thereby neglecting critical clinical context that might correct misinterpretations.</p><p>This image-dominant failure pattern warrants deeper mechanistic consideration. 
Incorrect diagnoses showed substantially higher self-reported image reliance compared to correct diagnoses, with a very large effect size, suggesting this is not merely statistical noise but a consistent pattern in self-reported attribution; whether this reflects actual information processing or post hoc rationalization remains to be determined. The fact that all incorrect diagnoses demonstrated image-dominant attribution patterns is particularly striking and suggests a potential &#x201C;tipping point&#x201D; beyond which diagnostic accuracy deteriorates markedly.</p><p>Several mechanisms could explain this pattern, though experimental validation is required to test these hypotheses. First, if visual processing capabilities lack domain-specific fine-grained discrimination for subtle radiological findings, excessive reliance on visual inputs might lead to confident but incorrect diagnoses. Second, the insufficient integration of clinical context could contribute to diagnostic errors. Third, architectural factors in multimodal integration remain unexplored and warrant investigation through controlled experiments with systematic input manipulation. These proposed mechanisms are speculative and require rigorous testing through image-only, text-only, and combined conditions to validate whether self-reported attribution patterns reflect actual information processing.</p><p>Comparison with human diagnostic patterns is instructive. Experienced radiologists typically use iterative hypothesis refinement, beginning with clinical context to generate differential diagnoses, and then using imaging to confirm or refute specific possibilities [<xref ref-type="bibr" rid="ref30">30</xref>]. This approach naturally balances modalities by forcing explicit integration. In contrast, GPT-4V may process visual and textual streams more independently, with final outputs reflecting whichever stream activates more strongly rather than true synthesis. 
A potential architectural or integration limitation could explain why adding visual capabilities sometimes degrades rather than enhances performance [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref23">23</xref>], though our observational data cannot establish this mechanism.</p><p>Our finding of higher overall image utilization compared to some studies [<xref ref-type="bibr" rid="ref22">22</xref>] may reflect explicit prompting to quantify image contribution or the imaging-centric nature of neuroradiology cases. However, the consistency of the image-dominant failure pattern across diverse pathology categories within neuroradiology (Table S6.2 in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>) suggests this is not merely an artifact of case selection within this domain. Whether this pattern generalizes to other radiological subspecialties or medical domains requires investigation in diverse clinical contexts. Our observed overall image utilization, while seemingly reasonable, may actually be excessive given that textual clinical information often carries disproportionate diagnostic weight relative to its volume.</p><p>That Busch et al [<xref ref-type="bibr" rid="ref21">21</xref>] demonstrated GPT-4V&#x2019;s superiority over text-only approaches in some tasks suggests the relationship between modality contribution and diagnostic success is task-dependent and complex. This task dependency likely reflects varying degrees of diagnostic specificity achievable through visual inspection alone versus requiring clinical correlation. For conditions with pathognomonic imaging features (eg, calcified subependymal nodules in tuberous sclerosis), visual dominance may succeed. For conditions requiring clinical-radiological synthesis (eg, distinguishing demyelination patterns based on temporal profile), balanced integration becomes essential. 
Our results suggest that GPT-4V may not appropriately adjust modality weighting for different diagnostic scenarios, though whether this reflects limitations in actual processing versus self-assessment requires validation through controlled experiments.</p><p>The broader implications extend to fundamental questions about multimodal AI architecture. Current vision-language models typically use late fusion, where separate encoders process each modality before combining representations [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>]. This approach, while computationally efficient, may fail to capture complex cross-modal dependencies essential for medical reasoning [<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref38">38</xref>]. Early fusion architectures that enable deeper integration from initial processing stages, or attention mechanisms explicitly trained to modulate cross-modal influence based on task demands, may better support the dynamic modality balancing that expert diagnosis requires. Our finding that incorrect diagnoses systematically show imbalanced modality utilization provides empirical motivation for such architectural innovations.</p></sec><sec id="s4-3"><title>Clinical Safety and Deployment Implications</title><p>Examination of incorrect responses revealed 2 critical failure patterns that have distinct clinical implications. First, the model frequently generated hallucinated rationales citing nonexistent findings [<xref ref-type="bibr" rid="ref24">24</xref>], consistent with documented hallucination rates of 35.5% to 63% in GPT-4V radiology applications [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref24">24</xref>]. 
Second, some errors reflected overemphasis on prominent visual findings while neglecting subtle clinical context, demonstrating that visual misinterpretations can lead the model astray when clinical information is insufficiently weighted.</p><p>These failure patterns carry distinct clinical risks requiring targeted mitigation strategies. Hallucinated findings are particularly dangerous because they appear authoritative and specific, potentially misleading clinicians who may not independently verify each claimed observation. In our study, hallucinations included references to imaging features not present in the provided images, incorrect anatomical localizations, and fabricated quantitative measurements. Such errors could lead to unnecessary interventions, incorrect diagnoses being entered into medical records, or delayed recognition of actual pathology.</p><p>The image-dominant failure mode presents a different risk profile. By over-weighting visual information that it cannot accurately interpret, GPT-4V may generate diagnoses that superficially align with prominent imaging features while missing the correct diagnosis that clinical context would suggest. This pattern is especially concerning in cases where imaging findings are nonspecific, but clinical history is highly discriminating. For example, ring-enhancing lesions have broad differential diagnoses, but patient age, immune status, and geographic location dramatically narrow possibilities [<xref ref-type="bibr" rid="ref39">39</xref>]. A system that overrelies on imaging might suggest common etiologies based on visual appearance while missing the correct diagnosis apparent from clinical context.</p><p>These limitations mandate restricted deployment. GPT-4V should be implemented only as an educational tool or decision-support aid that highlights findings for human review but never as an autonomous diagnostic system. 
Any radiological application must include mandatory radiologist oversight, with AI output supplementing rather than replacing expert judgment, as emphasized in multisociety professional guidelines [<xref ref-type="bibr" rid="ref40">40</xref>-<xref ref-type="bibr" rid="ref42">42</xref>]. Institutional protocols should explicitly prohibit applications bypassing human review. These restrictions remain necessary until multimodal integration capabilities achieve consistent, balanced utilization of both clinical and imaging information.</p><p>Specific implementation guidelines should include (1) interface design that presents AI outputs as preliminary suggestions explicitly requiring verification rather than definitive conclusions [<xref ref-type="bibr" rid="ref43">43</xref>-<xref ref-type="bibr" rid="ref45">45</xref>]; (2) transparent uncertainty quantification, ideally displaying the model&#x2019;s self-reported modality contributions alongside confidence estimates to flag high-risk image-dominant attributions; (3) training programs educating users about characteristic failure modes, particularly the tendency toward hallucinated findings and image-dominant errors; (4) future quality assurance protocols could explore whether AI attribution patterns predict diagnostic errors, though the 70% threshold observed in our small sample requires validation across larger, diverse datasets before clinical implementation; and (5) mandatory documentation of AI involvement in clinical reports to ensure appropriate medicolegal clarity and enable post hoc analysis of AI-associated diagnostic errors.</p><p>Workflow integration must preserve rather than undermine human expertise. Systems should be designed as &#x201C;AI-assisted&#x201D; rather than &#x201C;AI-augmented&#x201D; workflows, maintaining radiologist agency and encouraging critical evaluation. 
Evidence from other domains suggests that over-reliance on AI recommendations (automation bias) can degrade human performance, particularly when users lack mechanisms to assess AI reliability [<xref ref-type="bibr" rid="ref43">43</xref>,<xref ref-type="bibr" rid="ref44">44</xref>]. Interfaces should therefore facilitate the easy verification of AI claims, such as by highlighting specific image regions purportedly showing claimed findings, enabling radiologists to quickly confirm or refute visual interpretations.</p><p>Regulatory frameworks must evolve to address multimodal AI&#x2019;s unique challenges. Traditional medical device regulations focus on performance metrics including sensitivity, specificity, and accuracy but may inadequately address systematic failure modes, such as modality-specific overreliance or hallucination propensity [<xref ref-type="bibr" rid="ref46">46</xref>,<xref ref-type="bibr" rid="ref47">47</xref>]. Regulatory approval should require (1) comprehensive characterization of failure modes across diverse clinical scenarios, (2) validation that modality integration patterns align with domain expertise, (3) demonstration of appropriate uncertainty quantification, and (4) postmarket surveillance systems tracking AI-associated diagnostic errors. Our finding that image-dominant attribution predicts errors suggests that regulatory frameworks should incorporate modality balance metrics, potentially flagging deployments where typical attribution patterns diverge substantially from expert norms.</p><p>Educational implications are equally important. Radiology trainees must develop critical AI literacy, understanding both capabilities and characteristic failure modes of multimodal systems [<xref ref-type="bibr" rid="ref48">48</xref>-<xref ref-type="bibr" rid="ref50">50</xref>]. 
Training should include (1) recognition of hallucinated findings and strategies for systematic verification; (2) awareness that confident AI outputs may reflect overreliance on misinterpreted visual features; (3) skills in integrating AI suggestions with clinical reasoning rather than accepting them uncritically; and (4) understanding of when AI assistance is likely beneficial versus potentially misleading. Paradoxically, effective AI integration may require heightened rather than reduced emphasis on foundational clinical-radiological correlation skills [<xref ref-type="bibr" rid="ref51">51</xref>-<xref ref-type="bibr" rid="ref53">53</xref>], as overreliance on AI tools can diminish core competencies including diagnostic reasoning and clinical pattern recognition.</p><p>Comparison with human diagnostic errors provides important context. Radiologists also commit errors, with estimated miss rates varying by modality and pathology but often ranging from 3% to 5% for routine interpretations to 30% for subtle or complex findings [<xref ref-type="bibr" rid="ref54">54</xref>,<xref ref-type="bibr" rid="ref55">55</xref>]. However, human errors typically differ qualitatively from AI failures. Radiologists rarely hallucinate findings that do not exist; rather, they may overlook subtle abnormalities or misclassify ambiguous features [<xref ref-type="bibr" rid="ref54">54</xref>,<xref ref-type="bibr" rid="ref55">55</xref>]. Human errors often reflect attention limitations, cognitive biases, or knowledge gaps [<xref ref-type="bibr" rid="ref56">56</xref>,<xref ref-type="bibr" rid="ref57">57</xref>]. These are failure modes with well-established mitigation strategies, such as double-reading, checklists, and continuing education [<xref ref-type="bibr" rid="ref58">58</xref>,<xref ref-type="bibr" rid="ref59">59</xref>]. 
In contrast, AI hallucinations and systematic modality imbalances represent novel failure modes requiring new quality assurance approaches.</p><p>Our observed accuracy, while exceeding prior GPT-4V studies, remains below expert radiologist performance and insufficient for autonomous deployment. However, the more fundamental concern is not the accuracy level per se but the nature of failures. A system with 76% accuracy that fails randomly might be safely deployable with appropriate oversight, as human review would catch diverse errors. But a system showing systematic failure patterns (like our finding that image-dominant attribution reliably predicts errors) requires more cautious implementation, as certain case types may be systematically mishandled. Future deployment decisions must consider not only overall performance but failure pattern predictability and their alignment with human error patterns.</p><p>Our finding of higher overall image utilization compared to some studies [<xref ref-type="bibr" rid="ref22">22</xref>] may reflect explicit prompting to quantify image contribution or the imaging-centric nature of neuroradiology cases. That Busch et al [<xref ref-type="bibr" rid="ref21">21</xref>] demonstrated GPT-4V&#x2019;s superiority over text-only approaches in some tasks suggests the relationship between modality contribution and diagnostic success is task-dependent and complex.</p></sec><sec id="s4-4"><title>Limitations</title><p>Several important limitations affect the interpretation of our findings. The primary concern is reliance on self-reported attribution of image versus text utilization, which may represent post hoc rationalizations rather than actual information processing. 
Rigorous validation requires controlled experiments comparing text-only, image-only, and multimodal conditions with information-theoretic metrics, such as mutual information between modalities and diagnostic accuracy.</p><p>The sample size of 29 neuroradiology cases limits statistical power for subgroup analyses and restricts generalizability to other radiological subspecialties. Performance in other subspecialties may differ substantially [<xref ref-type="bibr" rid="ref21">21</xref>], and findings should not be extrapolated beyond adult neuroradiology. Multiple-choice format may overestimate performance relative to free-response clinical scenarios. The absence of ablation controls (text-only or image-only conditions) prevents the quantitative decomposition of relative modality contributions. Despite restricted RSNA access, data leakage cannot be definitively excluded with closed-source models. Finally, narrative justifications may not accurately reflect actual reasoning processes [<xref ref-type="bibr" rid="ref24">24</xref>], limiting confidence in interpreting self-reported modality attribution.</p></sec><sec id="s4-5"><title>Implications</title><p>For AI in radiology, these results highlight the importance of moving beyond simple accuracy metrics toward mechanistic understanding of how multimodal systems process heterogeneous data. Future research should prioritize controlled experimental validation through systematic input manipulation, development of information-theoretic frameworks for quantifying true (rather than self-reported) modality contributions, and standardized test sets with confirmed provenance postdating model training to definitively address data leakage concerns.</p><p>Technical improvements must focus on enhancing multimodal integration through architectural innovations or specialized training that forces explicit cross-referencing of visual and textual features. 
Interface design should evolve beyond simply adding AI outputs to workflows; instead, it should enable systems to express uncertainty transparently, highlight specific image regions, and respond to targeted clinician queries. Domain specialization through fine-tuning on curated radiology datasets remains essential, as general-purpose models exhibit variable performance across subspecialties.</p><p>Most critically, broader implications extend to establishing evidence-based frameworks for human-AI collaboration in clinical medicine. Current multimodal AI systems show promise as educational tools and decision-support aids but remain inappropriate for autonomous diagnostic applications. The field must resist premature deployment driven by technological enthusiasm, instead insisting on rigorous validation of both diagnostic accuracy and decision-making transparency. With continued technological advancement focused on balanced, context-aware data integration and systematic evaluation methodologies, future generations of multimodal AI may achieve robust, reliable performance necessary for meaningful contribution to radiologic practice and patient care.</p></sec><sec id="s4-6"><title>Conclusions: Broader Implications</title><p>This study contributes benchmark performance data and generates testable hypotheses about information integration patterns in diagnostic reasoning. The findings underscore that achieving high diagnostic accuracy requires more than adding visual capabilities to language models but demands sophisticated, balanced integration of clinical context and imaging findings. 
The exploratory observation that diagnostic failures may associate with imbalanced modality utilization suggests specific failure modes worthy of rigorous experimental investigation.</p><p>GPT-4V achieved 76% diagnostic accuracy on expert-validated neuroradiology cases, substantially exceeding prior GPT-4V performance on RSNA materials (43% by Mukherjee et al [<xref ref-type="bibr" rid="ref34">34</xref>]). This improvement suggests that focused domain application with structured prompting may enhance performance, though experimental studies with controlled manipulation of these factors would be needed to establish causal relationships. However, the novel finding that all incorrect diagnoses associated with image-dominant attribution patterns, with substantially higher visual reliance than correct diagnoses and a very large effect size, identifies a potentially systematic failure mode requiring targeted mitigation. Until multimodal AI systems demonstrate consistent, balanced integration of clinical and imaging information with transparent uncertainty quantification, deployment should remain restricted to supervised educational and decision-support applications with mandatory radiologist oversight.</p><p>With continued technological advancement focused on balanced, context-aware data integration and systematic evaluation methodologies, future generations of multimodal AI may achieve the robust, reliable performance necessary for meaningful contribution to radiologic practice and patient care.</p></sec></sec></body><back><ack><p>We gratefully acknowledge Adli Gates and Isaac Atkinson for their critical reading and meticulous editing of the manuscript. Their thoughtful feedback and detailed review significantly improved the clarity, structure, and overall quality of this work. We also thank the anonymous reviewers for their constructive comments that helped strengthen the methodological rigor and scholarly contribution of this study. 
This work would not have been possible without LECOM Research and Scholarship support.</p><p>We used ChatGPT and Claude for text proofreading and reference formatting, both of which were reviewed by the authors' team. We used Claude for drawing Table S1 and statistical analyses.</p><p>The authors declare the use of generative AI in the research and writing process. According to the GAIDeT taxonomy (2025), the following tasks were delegated to generative artificial intelligence (GAI) tools under full human supervision: idea generation, formulating research questions and hypotheses, feasibility assessment and risk evaluation, literature search and systematization, writing the literature review, evaluation of the novelty of the research and identification of gaps, development of experimental or research protocols, data collection, data curation and organization, data analysis, visualization, text generation, proofreading and editing, summarizing text, formulation of conclusions, reformatting, quality assessment, trend identification, and identification of limitations.</p><p>The GAI tool used was Claude Sonnet 4.5. Responsibility for the final manuscript lies entirely with the authors. 
GAI tools are not listed as authors and do not bear responsibility for the final outcomes.</p><p>We used Claude to assist with statistical analysis.</p><p>Declaration submitted by: TW</p></ack><notes><sec><title>Funding</title><p>The authors declared no financial support was received for this work.</p></sec><sec><title>Data Availability</title><p>The data that support the findings of this study are available from the corresponding author upon reasonable request.</p></sec></notes><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">CT</term><def><p>computed tomography</p></def></def-item><def-item><term id="abb3">GPT-4V</term><def><p>GPT-4 with Vision</p></def></def-item><def-item><term id="abb4">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb5">MRI</term><def><p>magnetic resonance imaging</p></def></def-item><def-item><term id="abb6">RSNA</term><def><p>Radiological Society of North America</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Topol</surname><given-names>EJ</given-names> </name></person-group><article-title>As artificial intelligence goes multimodal, medical applications multiply</article-title><source>Science</source><year>2023</year><month>09</month><day>15</day><volume>381</volume><issue>6663</issue><fpage>adk6139</fpage><pub-id pub-id-type="doi">10.1126/science.adk6139</pub-id><pub-id pub-id-type="medline">37708283</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Acosta</surname><given-names>JN</given-names> </name><name 
name-style="western"><surname>Falcone</surname><given-names>GJ</given-names> </name><name name-style="western"><surname>Rajpurkar</surname><given-names>P</given-names> </name><name name-style="western"><surname>Topol</surname><given-names>EJ</given-names> </name></person-group><article-title>Multimodal biomedical AI</article-title><source>Nat Med</source><year>2022</year><month>09</month><volume>28</volume><issue>9</issue><fpage>1773</fpage><lpage>1784</lpage><pub-id pub-id-type="doi">10.1038/s41591-022-01981-2</pub-id><pub-id pub-id-type="medline">36109635</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>JO</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>HY</given-names> </name><name name-style="western"><surname>Berzin</surname><given-names>TM</given-names> </name><name name-style="western"><surname>Sodickson</surname><given-names>DK</given-names> </name><name name-style="western"><surname>Rajpurkar</surname><given-names>P</given-names> </name></person-group><article-title>Multimodal generative AI for interpreting 3D medical images and videos</article-title><source>NPJ Digit Med</source><year>2025</year><month>05</month><day>13</day><volume>8</volume><issue>1</issue><fpage>273</fpage><pub-id pub-id-type="doi">10.1038/s41746-025-01649-4</pub-id><pub-id pub-id-type="medline">40360694</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Simon</surname><given-names>BD</given-names> </name><name name-style="western"><surname>Ozyoruk</surname><given-names>KB</given-names> </name><name name-style="western"><surname>Gelikman</surname><given-names>DG</given-names> </name><name name-style="western"><surname>Harmon</surname><given-names>SA</given-names> </name><name 
name-style="western"><surname>T&#x00FC;rkbey</surname><given-names>B</given-names> </name></person-group><article-title>The future of multimodal artificial intelligence models for integrating imaging and clinical metadata: a narrative review</article-title><source>Diagn Interv Radiol</source><year>2025</year><month>07</month><day>8</day><volume>31</volume><issue>4</issue><fpage>303</fpage><lpage>312</lpage><pub-id pub-id-type="doi">10.4274/dir.2024.242631</pub-id><pub-id pub-id-type="medline">39354728</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>SC</given-names> </name><name name-style="western"><surname>Jensen</surname><given-names>M</given-names> </name><name name-style="western"><surname>Yeung-Levy</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lungren</surname><given-names>MP</given-names> </name><name name-style="western"><surname>Poon</surname><given-names>H</given-names> </name><name name-style="western"><surname>Chaudhari</surname><given-names>AS</given-names> </name></person-group><article-title>A systematic review and implementation guidelines of multimodal foundation models in medical imaging</article-title><source>Res Sq</source><comment>Preprint posted online on  Apr 28, 2025</comment><pub-id pub-id-type="doi">10.21203/rs.3.rs-5537908/v1</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Yildirim</surname><given-names>N</given-names> </name><name name-style="western"><surname>Richardson</surname><given-names>H</given-names> </name><name name-style="western"><surname>Wetscherek</surname><given-names>MT</given-names> </name><etal/></person-group><article-title>Multimodal healthcare AI: identifying and designing clinically relevant 
vision-language applications for radiology</article-title><conf-name>CHI &#x2019;24: Proceedings of the 2024 CHI Conference on Human Factors in Computing Systems</conf-name><conf-date>May 11-16, 2024</conf-date><conf-loc>Honolulu, HI, USA</conf-loc><fpage>1</fpage><lpage>22</lpage><pub-id pub-id-type="doi">10.1145/3613904.3642013</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thirunavukarasu</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSJ</given-names> </name><name name-style="western"><surname>Elangovan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Gutierrez</surname><given-names>L</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSW</given-names> </name></person-group><article-title>Large language models in medicine</article-title><source>Nat Med</source><year>2023</year><month>08</month><volume>29</volume><issue>8</issue><fpage>1930</fpage><lpage>1940</lpage><pub-id pub-id-type="doi">10.1038/s41591-023-02448-8</pub-id><pub-id pub-id-type="medline">37460753</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mistry</surname><given-names>NP</given-names> </name><name name-style="western"><surname>Saeed</surname><given-names>H</given-names> </name><name name-style="western"><surname>Rafique</surname><given-names>S</given-names> </name><name name-style="western"><surname>Le</surname><given-names>T</given-names> </name><name name-style="western"><surname>Obaid</surname><given-names>H</given-names> </name><name name-style="western"><surname>Adams</surname><given-names>SJ</given-names> 
</name></person-group><article-title>Large language models as tools to generate radiology board-style multiple-choice questions</article-title><source>Acad Radiol</source><year>2024</year><month>09</month><volume>31</volume><issue>9</issue><fpage>3872</fpage><lpage>3878</lpage><pub-id pub-id-type="doi">10.1016/j.acra.2024.06.046</pub-id><pub-id pub-id-type="medline">39013736</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sodhi</surname><given-names>KS</given-names> </name><name name-style="western"><surname>Tao</surname><given-names>TY</given-names> </name><name name-style="western"><surname>Seymore</surname><given-names>N</given-names> </name></person-group><article-title>ChatGPT: chasing the storm in radiology training and education</article-title><source>Indian J Radiol Imaging</source><year>2023</year><month>10</month><volume>33</volume><issue>4</issue><fpage>431</fpage><lpage>435</lpage><pub-id pub-id-type="doi">10.1055/s-0043-1774743</pub-id><pub-id pub-id-type="medline">37811181</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Brin</surname><given-names>D</given-names> </name><name name-style="western"><surname>Sorin</surname><given-names>V</given-names> </name><name name-style="western"><surname>Barash</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Assessing GPT-4 multimodal performance in radiological image analysis</article-title><source>Eur Radiol</source><year>2025</year><month>04</month><volume>35</volume><issue>4</issue><fpage>1959</fpage><lpage>1965</lpage><pub-id pub-id-type="doi">10.1007/s00330-024-11035-5</pub-id><pub-id pub-id-type="medline">39214893</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sussan</surname><given-names>TT</given-names> </name><name name-style="western"><surname>Sussan</surname><given-names>RJ</given-names> </name><name name-style="western"><surname>Atkinson</surname><given-names>AG</given-names> </name><etal/></person-group><article-title>A comparative evaluation of GPT-4 Turbo and Gemini-Pro in medical licensing exams: enhancing artificial intelligence&#x2019;s role in medical education</article-title><source>Cureus</source><year>2026</year><month>01</month><volume>18</volume><issue>1</issue><fpage>e101101</fpage><pub-id pub-id-type="doi">10.7759/cureus.101101</pub-id><pub-id pub-id-type="medline">41658821</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huppertz</surname><given-names>MS</given-names> </name><name name-style="western"><surname>Siepmann</surname><given-names>R</given-names> </name><name name-style="western"><surname>Topp</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Revolution or risk?&#x2014;Assessing the potential and challenges of GPT-4V in radiologic image interpretation</article-title><source>Eur Radiol</source><year>2025</year><month>03</month><volume>35</volume><issue>3</issue><fpage>1111</fpage><lpage>1121</lpage><pub-id pub-id-type="doi">10.1007/s00330-024-11115-6</pub-id><pub-id pub-id-type="medline">39422726</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhou</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Ong</surname><given-names>H</given-names> </name><name name-style="western"><surname>Kennedy</surname><given-names>P</given-names> 
</name><etal/></person-group><article-title>Evaluating GPT-4V (GPT-4 with vision) on detection of radiologic findings on chest radiographs</article-title><source>Radiology</source><year>2024</year><month>05</month><volume>311</volume><issue>2</issue><fpage>e233270</fpage><pub-id pub-id-type="doi">10.1148/radiol.233270</pub-id><pub-id pub-id-type="medline">38713028</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Suh</surname><given-names>PS</given-names> </name><name name-style="western"><surname>Shim</surname><given-names>WH</given-names> </name><name name-style="western"><surname>Suh</surname><given-names>CH</given-names> </name><etal/></person-group><article-title>Comparing diagnostic accuracy of radiologists versus GPT-4V and Gemini Pro Vision using image inputs from diagnosis please cases</article-title><source>Radiology</source><year>2024</year><month>07</month><volume>312</volume><issue>1</issue><fpage>e240273</fpage><pub-id pub-id-type="doi">10.1148/radiol.240273</pub-id><pub-id pub-id-type="medline">38980179</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Horiuchi</surname><given-names>D</given-names> </name><name name-style="western"><surname>Tatekawa</surname><given-names>H</given-names> </name><name name-style="western"><surname>Oura</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Comparing the diagnostic performance of GPT-4-based ChatGPT, GPT-4V-based ChatGPT, and radiologists in challenging neuroradiology cases</article-title><source>Clin Neuroradiol</source><year>2024</year><month>12</month><volume>34</volume><issue>4</issue><fpage>779</fpage><lpage>787</lpage><pub-id pub-id-type="doi">10.1007/s00062-024-01426-y</pub-id><pub-id 
pub-id-type="medline">38806794</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hayden</surname><given-names>N</given-names> </name><name name-style="western"><surname>Gilbert</surname><given-names>S</given-names> </name><name name-style="western"><surname>Poisson</surname><given-names>LM</given-names> </name><name name-style="western"><surname>Griffith</surname><given-names>B</given-names> </name><name name-style="western"><surname>Klochko</surname><given-names>C</given-names> </name></person-group><article-title>Performance of GPT-4 with vision on text- and image-based ACR diagnostic radiology in-training examination questions</article-title><source>Radiology</source><year>2024</year><month>09</month><volume>312</volume><issue>3</issue><fpage>e240153</fpage><pub-id pub-id-type="doi">10.1148/radiol.240153</pub-id><pub-id pub-id-type="medline">39225605</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Albaqshi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ko</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Suh</surname><given-names>CH</given-names> </name><etal/></person-group><article-title>Evaluating diagnostic accuracy of large language models in neuroradiology cases using image inputs from JAMA neurology and JAMA clinical challenges</article-title><source>Sci Rep</source><year>2025</year><month>11</month><day>27</day><volume>15</volume><issue>1</issue><fpage>43027</fpage><pub-id pub-id-type="doi">10.1038/s41598-025-06458-z</pub-id><pub-id pub-id-type="medline">41309648</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Nguyen</surname><given-names>D</given-names> </name><name name-style="western"><surname>Bronson</surname><given-names>I</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>R</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>YH</given-names> </name></person-group><article-title>A systematic review and meta-analysis of GPT-based differential diagnostic accuracy in radiological cases: 2023-2025</article-title><source>Front Radiol</source><year>2025</year><volume>5</volume><fpage>1670517</fpage><pub-id pub-id-type="doi">10.3389/fradi.2025.1670517</pub-id><pub-id pub-id-type="medline">41229708</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kaczmarczyk</surname><given-names>R</given-names> </name><name name-style="western"><surname>Wilhelm</surname><given-names>TI</given-names> </name><name name-style="western"><surname>Martin</surname><given-names>R</given-names> </name><name name-style="western"><surname>Roos</surname><given-names>J</given-names> </name></person-group><article-title>Evaluating multimodal AI in medical diagnostics</article-title><source>NPJ Digit Med</source><year>2024</year><month>08</month><day>7</day><volume>7</volume><issue>1</issue><fpage>205</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01208-3</pub-id><pub-id pub-id-type="medline">39112822</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schramm</surname><given-names>S</given-names> </name><name name-style="western"><surname>Preis</surname><given-names>S</given-names> </name><name name-style="western"><surname>Metz</surname><given-names>MC</given-names> </name><etal/></person-group><article-title>Impact of multimodal prompt elements on 
diagnostic performance of GPT-4V in challenging brain MRI cases</article-title><source>Radiology</source><year>2025</year><month>01</month><volume>314</volume><issue>1</issue><fpage>e240689</fpage><pub-id pub-id-type="doi">10.1148/radiol.240689</pub-id><pub-id pub-id-type="medline">39835982</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Busch</surname><given-names>F</given-names> </name><name name-style="western"><surname>Han</surname><given-names>T</given-names> </name><name name-style="western"><surname>Makowski</surname><given-names>MR</given-names> </name><name name-style="western"><surname>Truhn</surname><given-names>D</given-names> </name><name name-style="western"><surname>Bressem</surname><given-names>KK</given-names> </name><name name-style="western"><surname>Adams</surname><given-names>L</given-names> </name></person-group><article-title>Integrating text and image analysis: exploring GPT-4V&#x2019;s capabilities in advanced radiological applications across subspecialties</article-title><source>J Med Internet Res</source><year>2024</year><month>05</month><day>1</day><volume>26</volume><fpage>e54948</fpage><pub-id pub-id-type="doi">10.2196/54948</pub-id><pub-id pub-id-type="medline">38691404</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hirosawa</surname><given-names>T</given-names> </name><name name-style="western"><surname>Harada</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Tokumasu</surname><given-names>K</given-names> </name><name name-style="western"><surname>Ito</surname><given-names>T</given-names> </name><name name-style="western"><surname>Suzuki</surname><given-names>T</given-names> </name><name 
name-style="western"><surname>Shimizu</surname><given-names>T</given-names> </name></person-group><article-title>Evaluating ChatGPT-4's diagnostic accuracy: impact of visual data integration</article-title><source>JMIR Med Inform</source><year>2024</year><month>04</month><day>9</day><volume>12</volume><fpage>e55627</fpage><pub-id pub-id-type="doi">10.2196/55627</pub-id><pub-id pub-id-type="medline">38592758</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Horiuchi</surname><given-names>D</given-names> </name><name name-style="western"><surname>Tatekawa</surname><given-names>H</given-names> </name><name name-style="western"><surname>Oura</surname><given-names>T</given-names> </name><etal/></person-group><article-title>ChatGPT&#x2019;s diagnostic performance based on textual vs. visual information compared to radiologists&#x2019; diagnostic performance in musculoskeletal radiology</article-title><source>Eur Radiol</source><year>2025</year><month>01</month><volume>35</volume><issue>1</issue><fpage>506</fpage><lpage>516</lpage><pub-id pub-id-type="doi">10.1007/s00330-024-10902-5</pub-id><pub-id pub-id-type="medline">38995378</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jin</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>F</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Hidden flaws behind expert-level accuracy of multimodal GPT-4 vision in medicine</article-title><source>NPJ Digit Med</source><year>2024</year><month>07</month><day>23</day><volume>7</volume><issue>1</issue><fpage>190</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01185-7</pub-id><pub-id 
pub-id-type="medline">39043988</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Deng</surname><given-names>J</given-names> </name><name name-style="western"><surname>Heybati</surname><given-names>K</given-names> </name><name name-style="western"><surname>Shammas-Toma</surname><given-names>M</given-names> </name></person-group><article-title>When vision meets reality: exploring the clinical applicability of GPT-4 with vision</article-title><source>Clin Imaging</source><year>2024</year><month>04</month><volume>108</volume><issue>3</issue><fpage>110101</fpage><pub-id pub-id-type="doi">10.1016/j.clinimag.2024.110101</pub-id><pub-id pub-id-type="medline">38341880</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Parillo</surname><given-names>M</given-names> </name><name name-style="western"><surname>Vaccarino</surname><given-names>F</given-names> </name><name name-style="western"><surname>Beomonte Zobel</surname><given-names>B</given-names> </name><name name-style="western"><surname>Mallio</surname><given-names>CA</given-names> </name></person-group><article-title>ChatGPT and radiology report: potential applications and limitations</article-title><source>Radiol Med</source><year>2024</year><month>12</month><volume>129</volume><issue>12</issue><fpage>1849</fpage><lpage>1863</lpage><pub-id pub-id-type="doi">10.1007/s11547-024-01915-7</pub-id><pub-id pub-id-type="medline">39508933</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rao</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Kamineni</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Evaluating GPT as an adjunct for radiologic decision making: GPT-4 versus GPT-3.5 in a breast imaging pilot</article-title><source>J Am Coll Radiol</source><year>2023</year><month>10</month><volume>20</volume><issue>10</issue><fpage>990</fpage><lpage>997</lpage><pub-id pub-id-type="doi">10.1016/j.jacr.2023.05.003</pub-id><pub-id pub-id-type="medline">37356806</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wada</surname><given-names>A</given-names> </name><name name-style="western"><surname>Akashi</surname><given-names>T</given-names> </name><name name-style="western"><surname>Shih</surname><given-names>G</given-names> </name><etal/></person-group><article-title>Optimizing GPT-4 turbo diagnostic accuracy in neuroradiology through prompt engineering and confidence thresholds</article-title><source>Diagnostics (Basel)</source><year>2024</year><month>07</month><day>17</day><volume>14</volume><issue>14</issue><fpage>1541</fpage><pub-id pub-id-type="doi">10.3390/diagnostics14141541</pub-id><pub-id pub-id-type="medline">39061677</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Strotzer</surname><given-names>QD</given-names> </name><name name-style="western"><surname>Nieberle</surname><given-names>F</given-names> </name><name name-style="western"><surname>Kupke</surname><given-names>LS</given-names> </name><etal/></person-group><article-title>Toward foundation models in radiology? 
Quantitative assessment of GPT-4V's multimodal and multianatomic region capabilities</article-title><source>Radiology</source><year>2024</year><month>11</month><volume>313</volume><issue>2</issue><fpage>e240955</fpage><pub-id pub-id-type="doi">10.1148/radiol.240955</pub-id><pub-id pub-id-type="medline">39589253</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bhayana</surname><given-names>R</given-names> </name></person-group><article-title>Chatbots and large language models in radiology: a practical primer for clinical and research applications</article-title><source>Radiology</source><year>2024</year><month>01</month><volume>310</volume><issue>1</issue><fpage>e232756</fpage><pub-id pub-id-type="doi">10.1148/radiol.232756</pub-id><pub-id pub-id-type="medline">38226883</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Appelbaum</surname><given-names>M</given-names> </name><name name-style="western"><surname>Cooper</surname><given-names>H</given-names> </name><name name-style="western"><surname>Kline</surname><given-names>RB</given-names> </name><name name-style="western"><surname>Mayo-Wilson</surname><given-names>E</given-names> </name><name name-style="western"><surname>Nezu</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Rao</surname><given-names>SM</given-names> </name></person-group><article-title>Journal article reporting standards for quantitative research in psychology: the APA Publications and Communications Board task force report</article-title><source>Am Psychol</source><year>2018</year><month>01</month><volume>73</volume><issue>1</issue><fpage>3</fpage><lpage>25</lpage><pub-id pub-id-type="doi">10.1037/amp0000191</pub-id><pub-id 
pub-id-type="medline">29345484</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dicle</surname><given-names>O</given-names> </name><name name-style="western"><surname>&#x00D6;zan</surname><given-names>S</given-names> </name><name name-style="western"><surname>&#x015E;ahin</surname><given-names>H</given-names> </name><name name-style="western"><surname>Se&#x00E7;il</surname><given-names>M</given-names> </name></person-group><article-title>How to perform an excellent radiology board examination: a web-based checklist</article-title><source>Insights Imaging</source><year>2021</year><month>01</month><day>7</day><volume>12</volume><issue>1</issue><fpage>4</fpage><pub-id pub-id-type="doi">10.1186/s13244-020-00924-0</pub-id><pub-id pub-id-type="medline">33411060</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="web"><article-title>Definitions for purposes of this policy (45 CFR &#x00A7;46.102)</article-title><source>National Code of Federal Regulations</source><access-date>2026-04-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.ecfr.gov/current/title-45/subtitle-A/subchapter-A/part-46/subpart-A/section-46.102">https://www.ecfr.gov/current/title-45/subtitle-A/subchapter-A/part-46/subpart-A/section-46.102</ext-link></comment></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mukherjee</surname><given-names>P</given-names> </name><name name-style="western"><surname>Hou</surname><given-names>B</given-names> </name><name name-style="western"><surname>Suri</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Evaluation of GPT large language model performance on RSNA 2023 case of the day 
questions</article-title><source>Radiology</source><year>2024</year><month>10</month><volume>313</volume><issue>1</issue><fpage>e240609</fpage><pub-id pub-id-type="doi">10.1148/radiol.240609</pub-id><pub-id pub-id-type="medline">39352277</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>SC</given-names> </name><name name-style="western"><surname>Pareek</surname><given-names>A</given-names> </name><name name-style="western"><surname>Seyyedi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Banerjee</surname><given-names>I</given-names> </name><name name-style="western"><surname>Lungren</surname><given-names>MP</given-names> </name></person-group><article-title>Fusion of medical imaging and electronic health records using deep learning: a systematic review and implementation guidelines</article-title><source>NPJ Digit Med</source><year>2020</year><volume>3</volume><issue>1</issue><fpage>136</fpage><pub-id pub-id-type="doi">10.1038/s41746-020-00341-z</pub-id><pub-id pub-id-type="medline">33083571</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Alsentzer</surname><given-names>E</given-names> </name><name name-style="western"><surname>Murphy</surname><given-names>J</given-names> </name><name name-style="western"><surname>Boag</surname><given-names>W</given-names> </name></person-group><article-title>Publicly available clinical BERT embeddings</article-title><year>2019</year><conf-name>Proceedings of the 2nd Clinical Natural Language Processing Workshop</conf-name><conf-date>Jun 7, 2019</conf-date><conf-loc>Minneapolis, Minnesota, USA</conf-loc><fpage>72</fpage><lpage>78</lpage><pub-id pub-id-type="doi">10.18653/v1/W19-1909</pub-id></nlm-citation></ref><ref 
id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>F</given-names> </name><name name-style="western"><surname>Kaushal</surname><given-names>R</given-names> </name><name name-style="western"><surname>Khullar</surname><given-names>D</given-names> </name></person-group><article-title>Should health care demand interpretable artificial intelligence or accept &#x201C;black box&#x201D; medicine?</article-title><source>Ann Intern Med</source><year>2020</year><month>01</month><day>7</day><volume>172</volume><issue>1</issue><fpage>59</fpage><lpage>60</lpage><pub-id pub-id-type="doi">10.7326/M19-2548</pub-id><pub-id pub-id-type="medline">31842204</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Amann</surname><given-names>J</given-names> </name><name name-style="western"><surname>Blasimme</surname><given-names>A</given-names> </name><name name-style="western"><surname>Vayena</surname><given-names>E</given-names> </name><name name-style="western"><surname>Frey</surname><given-names>D</given-names> </name><name name-style="western"><surname>Madai</surname><given-names>VI</given-names> </name><collab>Precise4Q consortium</collab></person-group><article-title>Explainability for artificial intelligence in healthcare: a multidisciplinary perspective</article-title><source>BMC Med Inform Decis Mak</source><year>2020</year><month>11</month><day>30</day><volume>20</volume><issue>1</issue><fpage>310</fpage><pub-id pub-id-type="doi">10.1186/s12911-020-01332-6</pub-id><pub-id pub-id-type="medline">33256715</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schwartz</surname><given-names>KM</given-names> </name><name 
name-style="western"><surname>Erickson</surname><given-names>BJ</given-names> </name><name name-style="western"><surname>Lucchinetti</surname><given-names>C</given-names> </name></person-group><article-title>Pattern of T2 hypointensity associated with ring-enhancing brain lesions can help to differentiate pathology</article-title><source>Neuroradiology</source><year>2006</year><month>03</month><volume>48</volume><issue>3</issue><fpage>143</fpage><lpage>149</lpage><pub-id pub-id-type="doi">10.1007/s00234-005-0024-5</pub-id><pub-id pub-id-type="medline">16447037</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Brady</surname><given-names>AP</given-names> </name><name name-style="western"><surname>Allen</surname><given-names>B</given-names> </name><name name-style="western"><surname>Chong</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Developing, purchasing, implementing and monitoring AI tools in radiology: practical considerations. 
A multi-society statement from the ACR, CAR, ESR, RANZCR and RSNA</article-title><source>Radiol Artif Intell</source><year>2024</year><month>01</month><volume>6</volume><issue>1</issue><fpage>e230513</fpage><pub-id pub-id-type="doi">10.1148/ryai.230513</pub-id><pub-id pub-id-type="medline">38251899</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dreyer</surname><given-names>KJ</given-names> </name><name name-style="western"><surname>Geis</surname><given-names>JR</given-names> </name></person-group><article-title>When machines think: radiology's next frontier</article-title><source>Radiology</source><year>2017</year><month>12</month><volume>285</volume><issue>3</issue><fpage>713</fpage><lpage>718</lpage><pub-id pub-id-type="doi">10.1148/radiol.2017171183</pub-id><pub-id pub-id-type="medline">29155639</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mongan</surname><given-names>J</given-names> </name><name name-style="western"><surname>Moy</surname><given-names>L</given-names> </name><name name-style="western"><surname>Kahn</surname><given-names>CE</given-names> <suffix>Jr</suffix></name></person-group><article-title>Checklist for artificial intelligence in medical imaging (CLAIM): a guide for authors and reviewers</article-title><source>Radiol Artif Intell</source><year>2020</year><month>03</month><volume>2</volume><issue>2</issue><fpage>e200029</fpage><pub-id pub-id-type="doi">10.1148/ryai.2020200029</pub-id><pub-id pub-id-type="medline">33937821</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Goddard</surname><given-names>K</given-names> </name><name 
name-style="western"><surname>Roudsari</surname><given-names>A</given-names> </name><name name-style="western"><surname>Wyatt</surname><given-names>JC</given-names> </name></person-group><article-title>Automation bias: a systematic review of frequency, effect mediators, and mitigators</article-title><source>J Am Med Inform Assoc</source><year>2012</year><volume>19</volume><issue>1</issue><fpage>121</fpage><lpage>127</lpage><pub-id pub-id-type="doi">10.1136/amiajnl-2011-000089</pub-id><pub-id pub-id-type="medline">21685142</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jabbour</surname><given-names>S</given-names> </name><name name-style="western"><surname>Fouhey</surname><given-names>D</given-names> </name><name name-style="western"><surname>Shepard</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Measuring the impact of AI in the diagnosis of hospitalized patients: a randomized clinical vignette survey study</article-title><source>JAMA</source><year>2023</year><month>12</month><day>19</day><volume>330</volume><issue>23</issue><fpage>2275</fpage><lpage>2284</lpage><pub-id pub-id-type="doi">10.1001/jama.2023.22295</pub-id><pub-id pub-id-type="medline">38112814</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sendak</surname><given-names>MP</given-names> </name><name name-style="western"><surname>Gao</surname><given-names>M</given-names> </name><name name-style="western"><surname>Brajer</surname><given-names>N</given-names> </name><name name-style="western"><surname>Balu</surname><given-names>S</given-names> </name></person-group><article-title>Presenting machine learning model information to clinical end users with model facts labels</article-title><source>NPJ Digit 
Med</source><year>2020</year><volume>3</volume><fpage>41</fpage><pub-id pub-id-type="doi">10.1038/s41746-020-0253-3</pub-id><pub-id pub-id-type="medline">32219182</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wu</surname><given-names>E</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>K</given-names> </name><name name-style="western"><surname>Daneshjou</surname><given-names>R</given-names> </name><name name-style="western"><surname>Ouyang</surname><given-names>D</given-names> </name><name name-style="western"><surname>Ho</surname><given-names>DE</given-names> </name><name name-style="western"><surname>Zou</surname><given-names>J</given-names> </name></person-group><article-title>How medical AI devices are evaluated: limitations and recommendations from an analysis of FDA approvals</article-title><source>Nat Med</source><year>2021</year><month>04</month><volume>27</volume><issue>4</issue><fpage>582</fpage><lpage>584</lpage><pub-id pub-id-type="doi">10.1038/s41591-021-01312-x</pub-id><pub-id pub-id-type="medline">33820998</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Cruz Rivera</surname><given-names>S</given-names> </name><name name-style="western"><surname>Moher</surname><given-names>D</given-names> </name><name name-style="western"><surname>Calvert</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Denniston</surname><given-names>AK</given-names> </name><collab>SPIRIT-AI and CONSORT-AI Working Group</collab></person-group><article-title>Reporting guidelines for clinical trial reports for interventions involving artificial intelligence: the CONSORT-AI 
extension</article-title><source>Nat Med</source><year>2020</year><month>09</month><volume>26</volume><issue>9</issue><fpage>1364</fpage><lpage>1374</lpage><pub-id pub-id-type="doi">10.1038/s41591-020-1034-x</pub-id><pub-id pub-id-type="medline">32908283</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>van Kooten</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>CO</given-names> </name><name name-style="western"><surname>Hofmeijer</surname><given-names>EIS</given-names> </name><etal/></person-group><article-title>A framework to integrate artificial intelligence training into radiology residency programs: preparing the future radiologist</article-title><source>Insights Imaging</source><year>2024</year><month>01</month><day>17</day><volume>15</volume><issue>1</issue><fpage>15</fpage><pub-id pub-id-type="doi">10.1186/s13244-023-01595-3</pub-id><pub-id pub-id-type="medline">38228800</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tajmir</surname><given-names>SH</given-names> </name><name name-style="western"><surname>Alkasab</surname><given-names>TK</given-names> </name></person-group><article-title>Toward augmented radiologists: changes in radiology education in the era of machine learning and artificial intelligence</article-title><source>Acad Radiol</source><year>2018</year><month>06</month><volume>25</volume><issue>6</issue><fpage>747</fpage><lpage>750</lpage><pub-id pub-id-type="doi">10.1016/j.acra.2018.03.007</pub-id><pub-id pub-id-type="medline">29599010</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Pesapane</surname><given-names>F</given-names> </name><name name-style="western"><surname>Codari</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sardanelli</surname><given-names>F</given-names> </name></person-group><article-title>Artificial intelligence in medical imaging: threat or opportunity? Radiologists again at the forefront of innovation in medicine</article-title><source>Eur Radiol Exp</source><year>2018</year><month>10</month><day>24</day><volume>2</volume><issue>1</issue><fpage>35</fpage><pub-id pub-id-type="doi">10.1186/s41747-018-0061-6</pub-id><pub-id pub-id-type="medline">30353365</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cabitza</surname><given-names>F</given-names> </name><name name-style="western"><surname>Rasoini</surname><given-names>R</given-names> </name><name name-style="western"><surname>Gensini</surname><given-names>GF</given-names> </name></person-group><article-title>Unintended consequences of machine learning in medicine</article-title><source>JAMA</source><year>2017</year><month>08</month><day>8</day><volume>318</volume><issue>6</issue><fpage>517</fpage><lpage>518</lpage><pub-id pub-id-type="doi">10.1001/jama.2017.7797</pub-id><pub-id pub-id-type="medline">28727867</pub-id></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Paranjape</surname><given-names>K</given-names> </name><name name-style="western"><surname>Schinkel</surname><given-names>M</given-names> </name><name name-style="western"><surname>Nannan Panday</surname><given-names>R</given-names> </name><name name-style="western"><surname>Car</surname><given-names>J</given-names> </name><name name-style="western"><surname>Nanayakkara</surname><given-names>P</given-names> 
</name></person-group><article-title>Introducing artificial intelligence training in medical education</article-title><source>JMIR Med Educ</source><year>2019</year><month>12</month><day>3</day><volume>5</volume><issue>2</issue><fpage>e16048</fpage><pub-id pub-id-type="doi">10.2196/16048</pub-id><pub-id pub-id-type="medline">31793895</pub-id></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Park</surname><given-names>CJ</given-names> </name><name name-style="western"><surname>Yi</surname><given-names>PH</given-names> </name><name name-style="western"><surname>Siegel</surname><given-names>EL</given-names> </name></person-group><article-title>Medical student perspectives on the impact of artificial intelligence on the practice of medicine</article-title><source>Curr Probl Diagn Radiol</source><year>2021</year><volume>50</volume><issue>5</issue><fpage>614</fpage><lpage>619</lpage><pub-id pub-id-type="doi">10.1067/j.cpradiol.2020.06.011</pub-id><pub-id pub-id-type="medline">32680632</pub-id></nlm-citation></ref><ref id="ref54"><label>54</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bruno</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Walker</surname><given-names>EA</given-names> </name><name name-style="western"><surname>Abujudeh</surname><given-names>HH</given-names> </name></person-group><article-title>Understanding and confronting our mistakes: the epidemiology of error in radiology and strategies for error reduction</article-title><source>Radiographics</source><year>2015</year><month>10</month><volume>35</volume><issue>6</issue><fpage>1668</fpage><lpage>1676</lpage><pub-id pub-id-type="doi">10.1148/rg.2015150023</pub-id><pub-id pub-id-type="medline">26466178</pub-id></nlm-citation></ref><ref id="ref55"><label>55</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Waite</surname><given-names>S</given-names> </name><name name-style="western"><surname>Scott</surname><given-names>J</given-names> </name><name name-style="western"><surname>Gale</surname><given-names>B</given-names> </name><name name-style="western"><surname>Fuchs</surname><given-names>T</given-names> </name><name name-style="western"><surname>Kolla</surname><given-names>S</given-names> </name><name name-style="western"><surname>Reede</surname><given-names>D</given-names> </name></person-group><article-title>Interpretive error in radiology</article-title><source>AJR Am J Roentgenol</source><year>2017</year><month>04</month><volume>208</volume><issue>4</issue><fpage>739</fpage><lpage>749</lpage><pub-id pub-id-type="doi">10.2214/AJR.16.16963</pub-id><pub-id pub-id-type="medline">28026210</pub-id></nlm-citation></ref><ref id="ref56"><label>56</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Krupinski</surname><given-names>EA</given-names> </name><name name-style="western"><surname>Berbaum</surname><given-names>KS</given-names> </name><name name-style="western"><surname>Caldwell</surname><given-names>RT</given-names> </name><name name-style="western"><surname>Schartz</surname><given-names>KM</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>J</given-names> </name></person-group><article-title>Long radiology workdays reduce detection and accommodation accuracy</article-title><source>J Am Coll Radiol</source><year>2010</year><month>09</month><volume>7</volume><issue>9</issue><fpage>698</fpage><lpage>704</lpage><pub-id pub-id-type="doi">10.1016/j.jacr.2010.03.004</pub-id><pub-id pub-id-type="medline">20816631</pub-id></nlm-citation></ref><ref id="ref57"><label>57</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Busby</surname><given-names>LP</given-names> </name><name name-style="western"><surname>Courtier</surname><given-names>JL</given-names> </name><name name-style="western"><surname>Glastonbury</surname><given-names>CM</given-names> </name></person-group><article-title>Bias in radiology: the how and why of misses and misinterpretations</article-title><source>Radiographics</source><year>2018</year><volume>38</volume><issue>1</issue><fpage>236</fpage><lpage>247</lpage><pub-id pub-id-type="doi">10.1148/rg.2018170107</pub-id><pub-id pub-id-type="medline">29194009</pub-id></nlm-citation></ref><ref id="ref58"><label>58</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Eng</surname><given-names>J</given-names> </name><name name-style="western"><surname>Mysko</surname><given-names>WK</given-names> </name><name name-style="western"><surname>Weller</surname><given-names>GE</given-names> </name><etal/></person-group><article-title>Interpretation of emergency department radiographs: a comparison of emergency medicine physicians with radiologists, residents with faculty, and film with digital display</article-title><source>AJR Am J Roentgenol</source><year>2000</year><month>11</month><volume>175</volume><issue>5</issue><fpage>1233</fpage><lpage>1238</lpage><pub-id pub-id-type="doi">10.2214/ajr.175.5.1751233</pub-id><pub-id pub-id-type="medline">11044013</pub-id></nlm-citation></ref><ref id="ref59"><label>59</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Larson</surname><given-names>DB</given-names> </name><name name-style="western"><surname>Nance</surname><given-names>JJ</given-names> </name></person-group><article-title>Rethinking peer review: what aviation can teach radiology about performance 
improvement</article-title><source>Radiology</source><year>2011</year><month>06</month><volume>259</volume><issue>3</issue><fpage>626</fpage><lpage>632</lpage><pub-id pub-id-type="doi">10.1148/radiol.11102222</pub-id><pub-id pub-id-type="medline">21602501</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Diagnostic performance and self-reported modality attribution of GPT-4 with Vision in neuroradiology cases.</p><media xlink:href="neuro_v5i1e69708_app1.docx" xlink:title="DOCX File, 47 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Study flow diagram and case distribution by pathology category for cross-sectional evaluation of GPT-4 with Vision diagnostic performance in neuroradiology.</p><media xlink:href="neuro_v5i1e69708_app2.docx" xlink:title="DOCX File, 49 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Complete case catalog with metadata, modality attribution, and performance summary by pathology category.</p><media xlink:href="neuro_v5i1e69708_app3.docx" xlink:title="DOCX File, 53 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Complete prompt template used to elicit diagnostic responses and modality attribution from GPT-4 with Vision, with representative example response.</p><media xlink:href="neuro_v5i1e69708_app4.docx" xlink:title="DOCX File, 329 KB"/></supplementary-material><supplementary-material id="app5"><label>Multimedia Appendix 5</label><p>Example of neuroradiology case questions for GPT-4 with Vision evaluation.</p><media xlink:href="neuro_v5i1e69708_app5.docx" xlink:title="DOCX File, 440 KB"/></supplementary-material><supplementary-material id="app6"><label>Multimedia Appendix 6</label><p>Operational definitions, measurement specifications, and validation rules for primary and exploratory variables in GPT-4 
with Vision neuroradiology diagnostic study.</p><media xlink:href="neuro_v5i1e69708_app6.docx" xlink:title="DOCX File, 49 KB"/></supplementary-material><supplementary-material id="app7"><label>Multimedia Appendix 7</label><p>Data quality verification, statistical assumptions testing, and comprehensive descriptive statistics, including completeness analysis, outlier detection, normality assessment, and variance homogeneity for GPT-4 with Vision neuroradiology study.</p><media xlink:href="neuro_v5i1e69708_app7.docx" xlink:title="DOCX File, 49 KB"/></supplementary-material><supplementary-material id="app8"><label>Multimedia Appendix 8</label><p>Post hoc power analysis, sensitivity analysis, and sample size justification for cross-sectional study of GPT-4 with Vision diagnostic performance in neuroradiology.</p><media xlink:href="neuro_v5i1e69708_app8.docx" xlink:title="DOCX File, 47 KB"/></supplementary-material><supplementary-material id="app9"><label>Multimedia Appendix 9</label><p>Complete statistical analysis results including primary diagnostic accuracy, exploratory modality attribution comparisons, and distribution patterns for GPT-4 with Vision neuroradiology evaluation.</p><media xlink:href="neuro_v5i1e69708_app9.docx" xlink:title="DOCX File, 49 KB"/></supplementary-material></app-group></back></article>