Neuron2Graph/index.html at main · apartresearch/Neuron2Graph · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299

<!DOCTYPE html>
<html lang="en">

<head>
    <meta charset="UTF-8">
    <meta http-equiv="x-ua-compatible" content="ie=edge">

    <title>Neuron2Graph</title>

    <meta name="description" content="N2G: A Scalable Approach for Quantifying Interpretable Neuron Representation in Large Language Models">
    <meta name="viewport" content="width=device-width, initial-scale=1">

    <!-- <base href="/"> -->

        <!--FACEBOOK-->
    <meta property="og:image" content="https://transformer-activation-tools.pages.dev/img/N2G.png">
    <meta property="og:image:type" content="image/png">
    <meta property="og:type" content="website" />
    <meta property="og:url" content="https://transformer-activation-tools.pages.dev/"/>
    <meta property="og:title" content="Neuron2Graph" />
    <meta property="og:description" content="Project page for Neuron2Graph." />

        <!--TWITTER-->
    <meta name="twitter:card" content="summary_large_image" />
    <meta name="twitter:title" content="Neuron2Graph" />
    <meta name="twitter:description" content="Project page for Neuron2Graph." />
    <meta name="twitter:image" content="https://transformer-activation-tools.pages.dev/img/N2G.png" />


<!--     <link rel="apple-touch-icon" href="apple-touch-icon.png"> -->
  <!-- <link rel="icon" type="image/png" href="img/seal_icon.png"> -->
    <!-- Place favicon.ico in the root directory -->

    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/css/bootstrap.min.css">
    <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/font-awesome/4.4.0/css/font-awesome.min.css">
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/codemirror/5.8.0/codemirror.min.css">
    <link rel="stylesheet" href="css/app.css">

    <link rel="stylesheet" href="css/bootstrap.min.css">

    <script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.3/jquery.min.js"></script>
    <script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.5/js/bootstrap.min.js"></script>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/codemirror/5.8.0/codemirror.min.js"></script>
    <script src="https://cdnjs.cloudflare.com/ajax/libs/clipboard.js/1.5.3/clipboard.min.js"></script>

    <script src="js/app.js"></script>
<!--
  Inline analytics bootstrap. Creates (or reuses) the global `dataLayer` array and
  defines `gtag()`, which pushes its own `arguments` object onto that array.
  Two entries are queued immediately: ('js', current Date) and
  ('config', 'G-52J0PM8XKV').
  NOTE(review): no external tag loader script is referenced anywhere in this file,
  so nothing visible here consumes `dataLayer`. Confirm whether js/app.js loads one
  or whether this snippet is currently inert.
-->
<script>
  window.dataLayer = window.dataLayer || [];
  function gtag(){dataLayer.push(arguments);}
  gtag('js', new Date());

  gtag('config', 'G-52J0PM8XKV');
</script>

    <!--
      Page-local style rules, declared after the linked stylesheets above.
      .nav-pills: relatively positioned and laid out inline.
      .imtip: absolutely positioned at the top-left corner of its nearest
      positioned ancestor.
      NOTE(review): the only element carrying class "imtip" in this file sits inside
      commented-out markup, so that rule matches nothing on the page as written.
      Confirm before removing it.
    -->
    <style>
        .nav-pills {
          position: relative;
          display: inline;
        }
        .imtip {
          position: absolute;
          top: 0;
          left: 0;
        }
    </style>
</head>

<body>
    <div class="container" id="main">
        <div class="row">
            <h2 class="col-md-12 text-center">
                <b><font size="+4">Neuron to Graph</font></b>
                <br>
                <b><font size="+2">Interpreting Language Model Neurons at Scale</font></b>
            </h2>
            <br>
                <!--<small>
                    CoRL 2021
                </small>-->
        </div>
        <div class="row">
            <div class="col-md-12 text-center">
                <ul class="list-inline">
                <br>
                <li>Alex Foote<sup>1</sup>*, Neel Nanda<sup>2</sup>, Esben Kran<sup>1</sup>, Ioannis Konstas<sup>3</sup>, Shay Cohen<sup>4</sup>, Fazl Barez<sup>1,4,5</sup>*</li>
                <br><br>
                    <!-- <a href="http://g.co/robotics">
                    <image src="img/robotics-at-google.png" height="40px"> Apart Research</a>
                    <a href="https://everydayrobots.com">
                    <image src="img/EverydayRobots2.gif" height="40px"> Everyday Robots</a> <br><br> -->
                    <sup>1</sup>Apart Research&#8287;&#8287;<sup>2</sup>Independent&#8287;&#8287;<sup>3</sup>School of Mathematical and Computer Sciences Heriot-Watt University&#8287;&#8287;<sup>4</sup>School of Informatics, University of Edinburgh&#8287;&#8287;<sup>5</sup>University of Oxford <br> * Equal contribution
                </ul>
            </div>
        </div>

        <div class="row">
                <div class="col-md-4 col-md-offset-4 text-center">
                    <ul class="nav nav-pills nav-justified">
                        <li>
                            <a href="https://arxiv.org/abs/2305.19911">
                            <img src="img/paper-tb.png" alt="" height="60" width="40">
                                <h4><strong>arXiv</strong></h4>
                            </a>
                        </li>
                    <!-- <li>
                            <a href="https://youtu.be/ysFav0b472w">
                            <image src="img/youtube_icon.png" height="60px">
                                <h4><strong>Video</strong></h4>
                            </a>
                        </li>
                        <li>
                            <a href="https://ai.googleblog.com/2022/08/towards-helpful-robots-grounding.html">
                            <image src="img/google-ai-blog-small.png" height="60px">
                                <image src="img/new.png" height="20px" class="imtip">
                                <h4><strong>Blogpost</strong></h4>
                            </a>
                        </li>
                         <li>
                            <a href="https://github.com/google-research/google-research/tree/master/saycan">
                            <image src="img/github.png" height="60px">
                                <h4><strong>Code</strong></h4>
                            </a>
                        </li> -->
                    </ul>
                </div>
        </div>

         <!-- <div class="row">
            <div class="col-md-8 col-md-offset-2">

                <h3>
                    What's New
                </h3>
                <p class="text-justify">

                <ul>
                    <li> <font color="#5a00b4">[8/16/2022]</font> We integrated SayCan with <a href="https://ai.googleblog.com/2022/04/pathways-language-model-palm-scaling-to.html">Pathways Language Model (PaLM)</a>, and updated the results. We also added <a href="#new-capability"> new capabilities</a> including drawer manipulation, chain of thought prompting and multilingual instructions. You can see all the new results in the updated <a href="assets/Neuron2Graph.pdf">paper</a>.</li>
                    <li><font color="#5a00b4">[8/16/2022]</font> Our updated results show that SayCan combined with the improved language model (PaLM), which we refer to as PaLM-SayCan, improves the <b>robotics performance</b> of the entire system compared to a previous LLM (FLAN). PaLM-SayCan chooses the correct sequence of skills 84% of the time and executes them successfully 74% of the time, reducing errors by a half compared to FLAN.  This is particularly exciting because it represents the first time we can see how an improvement in language models translates to a similar improvement in robotics.  </li>
                    <li><font color="#5a00b4">[8/16/2022]</font> We <a href="#open-source">open-sourced</a> a version of SayCan on a simulated tabletop environment. </li>
                    <li> [4/4/2022] Initial release of SayCan. </li>
                </ul>
            </div>
        </div> -->


        <div class="row">
            <div class="col-md-8 col-md-offset-2">
                <!-- <p style="text-align:center;">
        	    	<video id="v0" width="100%" playsinline autoplay muted loop controls>
                       <source src="img/demo_sequence_compressed.mp4" type="video/mp4">
                   </video>
                </p> -->
                <h3>
                    Abstract
                </h3>
                <p class="text-justify">
                    Understanding the function of individual neurons within language models is essential for mechanistic interpretability research. We propose Neuron to Graph
(N2G), a tool which takes a neuron and its dataset examples, and automatically
distills the neuron's behaviour on those examples to an interpretable graph. This
presents a less labour intensive approach to interpreting neurons than current manual methods, that will better scale these methods to large language models (LLMs). We use truncation and
saliency methods to only present the important tokens, and augment the dataset examples with more diverse samples to better capture the extent of neuron behaviour.
These graphs can be visualised to aid manual interpretation by researchers, but
can also output token activations on text to compare to the neuron's ground truth
activations for automatic validation. N2G represents a step towards scalable interpretability methods by allowing us to convert neurons in an LLM to interpretable
representations of measurable quality.
                </p>
            </div>
        </div>


	<!-- <div class="row">
            <div class="col-md-8 col-md-offset-2">
                <h3>
                    Video
                </h3>
                <div class="text-center">
                    <div style="position:relative;padding-top:56.25%;">
                        <iframe width="560" height="315" src="https://www.youtube.com/embed/Th6vwOtUt3k" allowfullscreen style="position:absolute;top:0;left:0;width:100%;height:100%;"></iframe>
                    </div>
                </div>
            </div>
        </div> -->


        <div class="row">
            <div class="col-md-8 col-md-offset-2">

<p style="text-align:center;">
    <img src="img/architecture.png" alt="Overall architecture of N2G" class="img-responsive" height="600">
        <i>Figure 1: Overall architecture of N2G. Activations of the target neuron on the dataset examples
are retrieved (neuron and activating tokens in red). Prompts are pruned and the importance of each
token for neuron activation is measured (important tokens in blue). Pruned prompts are augmented
by replacing important tokens with high-probability substitutes using BERT. The augmented set of
prompts are converted to a graph. The output graph is a real example which activates on the token
“except” when preceded by any of the other tokens.</i>

</p>

<br>
<p style="text-align:center;">
    <img src="img/example_with.png" alt="Example neuron graph built from Neuron 2 of Layer 1 of the model" class="img-responsive" height="600">
        <i>Figure 2: An example of a graph built from Neuron 2 of Layer 1 of the model.</i>

</p>
<br>
<p style="text-align:center;">
    <img src="img/eval_table.png" alt="Table of precision, recall and F1-score of the neuron graphs' token-level predictions, per layer" class="img-responsive" height="600">
        <i>Table 1: Precision, recall and F1-score of the neuron graphs' token-level predictions of neuron firing
            compared to ground truth on held-out test data, for 50 random neurons from each layer of the model.
            Tokens on which the real neuron fired and tokens on which it didn't fire are evaluated separately as
            there are generally many more tokens on which a neuron didn't fire, making it trivially easy to get
            near-perfect scores by always predicting the neuron will not fire.</i>

</p>

<br><br>

<p><b><i>Work in progress</i></b></p>

<br><br>

            </div>
        </div>

        <!-- <div class="row">
            <div class="col-md-8 col-md-offset-2">
                <h3>
                    Results
                </h3>
		<p class="text-justify">
		We found that when we ran the code, the results came out.
		</p>

                <p style="text-align:center;">
                    <image src="img/graph_10_examples_n_20.png"  class="img-responsive" height="600px">
                </p>
	        </div>
        </div> -->


         <!-- <div class="row">
            <div class="col-md-8 col-md-offset-2">
                <h3>
                    Citation
                </h3> <a href="https://arxiv.org/abs/2204.01691">[arxiv version]</a>
                <div class="form-group col-md-10 col-md-offset-1">
                    <textarea id="bibtex" class="form-control" readonly>
@inproceedings{acl2023interpretllmn,
    title={Interpreting Large Language Model Neurons},
    author={Anonymous authors},
    booktitle={ACL submission},
    year={2023}
}</textarea>
                </div>
            </div>

        </div> -->


         <!-- <div class="row">
            <div class="col-md-8 col-md-offset-2">
                <h3>
                    <font color="#5a00b4">Models used in the paper</font>
                </h3>
               <font color="#5a00b4">The models are available...<a href="https://huggingface.co/gpt2">[gpt-2]</a></font>
            </div>
            <br/><br/>
        </div> -->


         <!-- <div class="row">
            <div id="open-source" class="col-md-8 col-md-offset-2">
                <h3>
                    <font color="#5a00b4">Open Source</font>
                </h3>
              <font color="#5a00b4">We open source a version of SayCan that works with a simulated tabletop environment. <a href="https://github.com/google-research/google-research/tree/master/saycan">[tabletop saycan] </a> </font>
              <p style="text-align:center;">
                    <img src="img/open_source_tabletop.png" class="img-responsive" height="600px">
                </p>
            </div>
        </div>

        <div class="row">
            <div class="col-md-8 col-md-offset-2">
                <h3>
                    Acknowledgements
                </h3>
                <p class="text-justify">
The authors would like to thank Fred Alcober, Yunfei Bai, Matt Bennice, Maarten Bosma, Justin Boyd, Bill Byrne, Kendra Byrne, Noah Constant, Pete Florence, Laura Graesser, Rico Jonschkowski, Daniel Kappler, Hugo Larochelle, Benjamin Lee, Adrian Li, Maysam Moussalem, Suraj Nair, Jane Park, Evan Rapoport, Krista Reymann, Jeff Seto, Dhruv Shah, Ian Storz, Razvan Surdulescu, Tom Small, Jason Wei, and Vincent Zhao for their help and support in various aspects of the project.
                    <br><br>
                The website template was borrowed from <a href="http://jonbarron.info/">Jon Barron</a>.
                </p>
            </div>
        </div> -->
    </div>
    <!-- 100% privacy friendly analytics -->
<script async defer src="https://scripts.simpleanalyticscdn.com/latest.js"></script>
<noscript><img src="https://queue.simpleanalyticscdn.com/noscript.gif" alt="" referrerpolicy="no-referrer-when-downgrade" /></noscript>
</body>
</html>