iofu728 committed
Commit 43a7079
1 Parent(s): 9310ba1

Feature(MInference): build demo
.gitignore ADDED
@@ -0,0 +1,415 @@
+ ## Ignore Visual Studio temporary files, build results, and
+ ## files generated by popular Visual Studio add-ons.
+ ##
+ ## Get latest from https://github.com/github/gitignore/blob/main/VisualStudio.gitignore
+
+ # User-specific files
+ *.rsuser
+ *.suo
+ *.user
+ *.userosscache
+ *.sln.docstates
+
+ # User-specific files (MonoDevelop/Xamarin Studio)
+ *.userprefs
+
+ # Mono auto generated files
+ mono_crash.*
+
+ # Build results
+ [Dd]ebug/
+ [Dd]ebugPublic/
+ [Rr]elease/
+ [Rr]eleases/
+ x64/
+ x86/
+ [Ww][Ii][Nn]32/
+ [Aa][Rr][Mm]/
+ [Aa][Rr][Mm]64/
+ bld/
+ [Bb]in/
+ [Oo]bj/
+ [Ll]og/
+ [Ll]ogs/
+
+ # Visual Studio 2015/2017 cache/options directory
+ .vs/
+ # Uncomment if you have tasks that create the project's static files in wwwroot
+ #wwwroot/
+
+ # Visual Studio 2017 auto generated files
+ Generated\ Files/
+
+ # MSTest test Results
+ [Tt]est[Rr]esult*/
+ [Bb]uild[Ll]og.*
+
+ # NUnit
+ *.VisualState.xml
+ TestResult.xml
+ nunit-*.xml
+
+ # Build Results of an ATL Project
+ [Dd]ebugPS/
+ [Rr]eleasePS/
+ dlldata.c
+
+ # Benchmark Results
+ BenchmarkDotNet.Artifacts/
+
+ # .NET Core
+ project.lock.json
+ project.fragment.lock.json
+ artifacts/
+
+ # ASP.NET Scaffolding
+ ScaffoldingReadMe.txt
+
+ # StyleCop
+ StyleCopReport.xml
+
+ # Files built by Visual Studio
+ *_i.c
+ *_p.c
+ *_h.h
+ *.ilk
+ *.meta
+ *.obj
+ *.iobj
+ *.pch
+ *.pdb
+ *.ipdb
+ *.pgc
+ *.pgd
+ *.rsp
+ *.sbr
+ *.tlb
+ *.tli
+ *.tlh
+ *.tmp
+ *.tmp_proj
+ *_wpftmp.csproj
+ *.log
+ *.tlog
+ *.vspscc
+ *.vssscc
+ .builds
+ *.pidb
+ *.svclog
+ *.scc
+
+ # Chutzpah Test files
+ _Chutzpah*
+
+ # Visual C++ cache files
+ ipch/
+ *.aps
+ *.ncb
+ *.opendb
+ *.opensdf
+ *.sdf
+ *.cachefile
+ *.VC.db
+ *.VC.VC.opendb
+
+ # Visual Studio profiler
+ *.psess
+ *.vsp
+ *.vspx
+ *.sap
+
+ # Visual Studio Trace Files
+ *.e2e
+
+ # TFS 2012 Local Workspace
+ $tf/
+
+ # Guidance Automation Toolkit
+ *.gpState
+
+ # ReSharper is a .NET coding add-in
+ _ReSharper*/
+ *.[Rr]e[Ss]harper
+ *.DotSettings.user
+
+ # TeamCity is a build add-in
+ _TeamCity*
+
+ # DotCover is a Code Coverage Tool
+ *.dotCover
+
+ # AxoCover is a Code Coverage Tool
+ .axoCover/*
+ !.axoCover/settings.json
+
+ # Coverlet is a free, cross platform Code Coverage Tool
+ coverage*.json
+ coverage*.xml
+ coverage*.info
+
+ # Visual Studio code coverage results
+ *.coverage
+ *.coveragexml
+
+ # NCrunch
+ _NCrunch_*
+ .*crunch*.local.xml
+ nCrunchTemp_*
+
+ # MightyMoose
+ *.mm.*
+ AutoTest.Net/
+
+ # Web workbench (sass)
+ .sass-cache/
+
+ # Installshield output folder
+ [Ee]xpress/
+
+ # DocProject is a documentation generator add-in
+ DocProject/buildhelp/
+ DocProject/Help/*.HxT
+ DocProject/Help/*.HxC
+ DocProject/Help/*.hhc
+ DocProject/Help/*.hhk
+ DocProject/Help/*.hhp
+ DocProject/Help/Html2
+ DocProject/Help/html
+
+ # Click-Once directory
+ publish/
+
+ # Publish Web Output
+ *.[Pp]ublish.xml
+ *.azurePubxml
+ # Note: Comment the next line if you want to checkin your web deploy settings,
+ # but database connection strings (with potential passwords) will be unencrypted
+ *.pubxml
+ *.publishproj
+
+ # Microsoft Azure Web App publish settings. Comment the next line if you want to
+ # checkin your Azure Web App publish settings, but sensitive information contained
+ # in these scripts will be unencrypted
+ PublishScripts/
+
+ # NuGet Packages
+ *.nupkg
+ # NuGet Symbol Packages
+ *.snupkg
+ # The packages folder can be ignored because of Package Restore
+ **/[Pp]ackages/*
+ # except build/, which is used as an MSBuild target.
+ !**/[Pp]ackages/build/
+ # Uncomment if necessary however generally it will be regenerated when needed
+ #!**/[Pp]ackages/repositories.config
+ # NuGet v3's project.json files produces more ignorable files
+ *.nuget.props
+ *.nuget.targets
+
+ # Microsoft Azure Build Output
+ csx/
+ *.build.csdef
+
+ # Microsoft Azure Emulator
+ ecf/
+ rcf/
+
+ # Windows Store app package directories and files
+ AppPackages/
+ BundleArtifacts/
+ Package.StoreAssociation.xml
+ _pkginfo.txt
+ *.appx
+ *.appxbundle
+ *.appxupload
+
+ # Visual Studio cache files
+ # files ending in .cache can be ignored
+ *.[Cc]ache
+ # but keep track of directories ending in .cache
+ !?*.[Cc]ache/
+
+ # Others
+ ClientBin/
+ ~$*
+ *~
+ *.dbmdl
+ *.dbproj.schemaview
+ *.jfm
+ *.pfx
+ *.publishsettings
+ orleans.codegen.cs
+
+ # Including strong name files can present a security risk
+ # (https://github.com/github/gitignore/pull/2483#issue-259490424)
+ #*.snk
+
+ # Since there are multiple workflows, uncomment next line to ignore bower_components
+ # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
+ #bower_components/
+
+ # RIA/Silverlight projects
+ Generated_Code/
+
+ # Backup & report files from converting an old project file
+ # to a newer Visual Studio version. Backup files are not needed,
+ # because we have git ;-)
+ _UpgradeReport_Files/
+ Backup*/
+ UpgradeLog*.XML
+ UpgradeLog*.htm
+ ServiceFabricBackup/
+ *.rptproj.bak
+
+ # SQL Server files
+ *.mdf
+ *.ldf
+ *.ndf
+
+ # Business Intelligence projects
+ *.rdl.data
+ *.bim.layout
+ *.bim_*.settings
+ *.rptproj.rsuser
+ *- [Bb]ackup.rdl
+ *- [Bb]ackup ([0-9]).rdl
+ *- [Bb]ackup ([0-9][0-9]).rdl
+
+ # Microsoft Fakes
+ FakesAssemblies/
+
+ # GhostDoc plugin setting file
+ *.GhostDoc.xml
+
+ # Node.js Tools for Visual Studio
+ .ntvs_analysis.dat
+ node_modules/
+
+ # Visual Studio 6 build log
+ *.plg
+
+ # Visual Studio 6 workspace options file
+ *.opt
+
+ # Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
+ *.vbw
+
+ # Visual Studio 6 auto-generated project file (contains which files were open etc.)
+ *.vbp
+
+ # Visual Studio 6 workspace and project file (working project files containing files to include in project)
+ *.dsw
+ *.dsp
+
+ # Visual Studio 6 technical files
+ *.ncb
+ *.aps
+
+ # Visual Studio LightSwitch build output
+ **/*.HTMLClient/GeneratedArtifacts
+ **/*.DesktopClient/GeneratedArtifacts
+ **/*.DesktopClient/ModelManifest.xml
+ **/*.Server/GeneratedArtifacts
+ **/*.Server/ModelManifest.xml
+ _Pvt_Extensions
+
+ # Paket dependency manager
+ .paket/paket.exe
+ paket-files/
+
+ # FAKE - F# Make
+ .fake/
+
+ # CodeRush personal settings
+ .cr/personal
+
+ # Python Tools for Visual Studio (PTVS)
+ __pycache__/
+ *.pyc
+
+ # Cake - Uncomment if you are using it
+ # tools/**
+ # !tools/packages.config
+
+ # Tabs Studio
+ *.tss
+
+ # Telerik's JustMock configuration file
+ *.jmconfig
+
+ # BizTalk build output
+ *.btp.cs
+ *.btm.cs
+ *.odx.cs
+ *.xsd.cs
+
+ # OpenCover UI analysis results
+ OpenCover/
+
+ # Azure Stream Analytics local run output
+ ASALocalRun/
+
+ # MSBuild Binary and Structured Log
+ *.binlog
+
+ # NVidia Nsight GPU debugger configuration file
+ *.nvuser
+
+ # MFractors (Xamarin productivity tool) working folder
+ .mfractor/
+
+ # Local History for Visual Studio
+ .localhistory/
+
+ # Visual Studio History (VSHistory) files
+ .vshistory/
+
+ # BeatPulse healthcheck temp database
+ healthchecksdb
+
+ # Backup folder for Package Reference Convert tool in Visual Studio 2017
+ MigrationBackup/
+
+ # Ionide (cross platform F# VS Code tools) working folder
+ .ionide/
+
+ # Fody - auto-generated XML schema
+ FodyWeavers.xsd
+
+ # VS Code files for those working on multiple tools
+ .vscode/*
+ !.vscode/settings.json
+ !.vscode/tasks.json
+ !.vscode/launch.json
+ !.vscode/extensions.json
+ *.code-workspace
+
+ # Local History for Visual Studio Code
+ .history/
+
+ # Windows Installer files from build outputs
+ *.cab
+ *.msi
+ *.msix
+ *.msm
+ *.msp
+
+ # JetBrains Rider
+ *.sln.iml
+
+ # Experiments
+ data
+ !experiments/ruler/data
+ needle
+ results
+ *.json
+ *.jsonl
+ .vscode/
+ *.pt
+ *.pkl
+ !minference/configs/*
+
+ __pycache__
+ build/
+ *.egg-info/
+ *.so
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) Microsoft Corporation.
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE
README.md CHANGED
@@ -10,4 +10,133 @@ pinned: false
  license: mit
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ <div style="display: flex; align-items: center;">
+ <div style="width: 100px; margin-right: 10px; height:auto;" align="left">
+ <img src="images/MInference_logo.png" alt="MInference" width="100" align="left">
+ </div>
+ <div style="flex-grow: 1;" align="center">
+ <h2 align="center">MInference: Million-Tokens Prompt Inference for LLMs</h2>
+ </div>
+ </div>
+
+ <p align="center">
+ | <a href="https://llmlingua.com/"><b>Project Page</b></a> |
+ <a href="https://arxiv.org/abs/2406."><b>Paper</b></a> |
+ <a href="https://huggingface.co/spaces/microsoft/MInference"><b>Demo</b></a> |
+ </p>
+
+ https://github.com/microsoft/MInference/assets/30883354/52613efc-738f-4081-8367-7123c81d6b19
+
+ ## TL;DR
+
+ **MInference 1.0** leverages the dynamic sparse nature of LLMs' attention, which exhibits some static patterns, to speed up pre-filling for long-context LLMs. It first determines offline which sparse pattern each attention head belongs to, then approximates the sparse index online and dynamically computes attention with optimized custom kernels. This approach achieves up to a **10x speedup** for pre-filling on an A100 while maintaining accuracy.
+
+ - [MInference 1.0: Accelerating Pre-filling for Long-Context LLMs via Dynamic Sparse Attention](https://arxiv.org/abs/2406.) (Under Review)<br>
+ _Huiqiang Jiang†, Yucheng Li†, Chengruidong Zhang†, Qianhui Wu, Xufang Luo, Surin Ahn, Zhenhua Han, Amir H. Abdi, Dongsheng Li, Chin-Yew Lin, Yuqing Yang and Lili Qiu_
+
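To make the pattern vocabulary concrete, here is a toy, dense-mask sketch of the `vertical_and_slash` layout that the shipped config files (`minference/configs/*.json`) assign to heads. This is purely illustrative: the actual kernels build block-sparse indices on the fly, and the key-column indices and diagonal offsets below are invented for the example.

```python
import torch

def vertical_and_slash_mask(seq_len: int, vertical_idx, slash_offsets):
    # "Vertical" component: every query attends to a few key columns.
    mask = torch.zeros(seq_len, seq_len, dtype=torch.bool)
    mask[:, vertical_idx] = True
    # "Slash" component: diagonals at fixed offsets (0 = the main diagonal).
    for off in slash_offsets:
        idx = torch.arange(off, seq_len)
        mask[idx, idx - off] = True
    # Keep only the causal (lower-triangular) half.
    return mask & torch.ones(seq_len, seq_len).tril().bool()

# Toy example: key columns 0 and 3 plus the last three diagonals.
print(vertical_and_slash_mask(8, [0, 3], [0, 1, 2]).int())
```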
+ ## 🎥 Overview
+
+ ![Onepage of MInference](./images/MInference1_onepage.png)
+
+ ## 🎯 Quick Start
+
+ ### Requirements
+
+ - Torch
+ - FlashAttention-2
+ - Triton == 2.1.0
+
+ To get started with MInference, simply install it using pip:
+
+ ```bash
+ pip install minference
+ ```
+
+ ### How to use MInference
+
+ For HF Transformers:
+ ```diff
+ from transformers import pipeline
+ +from minference import MInference
+
+ pipe = pipeline("text-generation", model=model_name, torch_dtype="auto", device_map="auto")
+
+ # Patch MInference Module
+ +minference_patch = MInference("minference", model_name)
+ +pipe.model = minference_patch(pipe.model)
+
+ pipe(prompt, max_length=10)
+ ```
+
+ For vLLM:
+
+ ```diff
+ from vllm import LLM, SamplingParams
+ +from minference import MInference
+
+ llm = LLM(model_name, max_num_seqs=1, enforce_eager=True, max_model_len=128000)
+
+ # Patch MInference Module
+ +minference_patch = MInference("vllm", model_name)
+ +llm = minference_patch(llm)
+
+ outputs = llm.generate(prompts, sampling_params)
+ ```
+
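Putting the HF Transformers path together, a minimal end-to-end sketch. The checkpoint is the one this Space's `app.py` loads; any model MInference supports should work the same way.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from minference import MInference

model_name = "gradientai/Llama-3-8B-Instruct-262k"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")

# Patch the attention modules in place, as in the diff snippets above.
model = MInference("minference", model_name)(model)

inputs = tokenizer("Summarize the following document: ...", return_tensors="pt").to(model.device)
output = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```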
+ ## FAQ
+
+ For more insights and answers, visit our [FAQ section](./Transparency_FAQ.md).
+
+ **Q1: How to effectively evaluate the impact of dynamic sparse attention on the capabilities of long-context LLMs?**
+
+ To effectively evaluate long-context LLM capabilities, we tested: 1) the effective context window with RULER, 2) general long-context tasks with InfiniteBench, 3) retrieval tasks across different contexts and positions with Needle in a Haystack, and 4) language model prediction with PG-19.<br/>
+ We found that traditional methods perform poorly in retrieval tasks, with difficulty increasing as follows: KV retrieval (every key is a needle) > Needle in a Haystack > Retrieval.Number > Retrieval.PassKey. The key challenge is the semantic difference between needles and the haystack: traditional methods perform better when this difference is larger, as in passkey tasks. KV retrieval demands stronger retrieval capability since any key can be a target, and multi-needle tasks are more complex still.<br/>
+ We will continue to update our results with more models and datasets in future versions.
+
+ **Q2: Does this dynamic sparse attention pattern only exist in long-context LLMs that are not fully trained?**
+
+ Firstly, attention is dynamically sparse for both short and long contexts, a characteristic inherent to the attention mechanism.
+ Additionally, we selected the state-of-the-art open-source long-context LLM, LLaMA-3-8B-Instruct-1M, which has an effective context window of 16K; with MInference, this can be extended to 32K.
+ We will continue to adapt our method to other advanced long-context LLMs, update our results, and explore the theoretical reasons behind this dynamic sparse attention pattern.
+
+ **Q3: What is the relationship between MInference, SSM, Linear Attention, and Sparse Attention?**
+
+ All four approaches (MInference, SSM, Linear Attention, and Sparse Attention) are efficient solutions for reducing the high complexity of attention in Transformers, each introducing inductive bias from a different perspective. Notably, the latter three require training from scratch.
+ Additionally, recent works like Mamba-2 and the Unified Implicit Attention Representation unify SSM and Linear Attention as static sparse attention; Mamba-2 itself is a block-wise sparse attention method.
+ Intuitively, the significant sparse redundancy in attention suggests that all of these approaches have potential. However, static sparse attention may not handle dynamic semantic associations well, especially in complex tasks, whereas dynamic sparse attention holds more potential for capturing these dynamic relationships.
+
+ ## Citation
+
+ If you find MInference useful or relevant to your project and research, please kindly cite our paper:
+
+ ```bibtex
+ @article{jiang2024minference,
+     title={MInference 1.0: Accelerating Pre-filling for Long-Context LLMs via Dynamic Sparse Attention},
+     author={Jiang, Huiqiang and Li, Yucheng and Zhang, Chengruidong and Wu, Qianhui and Luo, Xufang and Ahn, Surin and Han, Zhenhua and Abdi, Amir H and Li, Dongsheng and Lin, Chin-Yew and Yang, Yuqing and Qiu, Lili},
+     journal={arXiv},
+     year={2024}
+ }
+ ```
+
+ ## Contributing
+
+ This project welcomes contributions and suggestions. Most contributions require you to agree to a
+ Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
+ the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
+
+ When you submit a pull request, a CLA bot will automatically determine whether you need to provide
+ a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
+ provided by the bot. You will only need to do this once across all repos using our CLA.
+
+ This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
+ For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
+ contact [[email protected]](mailto:[email protected]) with any additional questions or comments.
+
+ ## Trademarks
+
+ This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
+ trademarks or logos is subject to and must follow
+ [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
+ Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
+ Any use of third-party trademarks or logos is subject to those third parties' policies.
app.py CHANGED
@@ -1,7 +1,148 @@
  import gradio as gr
-
- def greet(name):
-     return "Hello " + name + "!!"
-
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
- demo.launch()
+ import os
+ import spaces
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+ from threading import Thread
+ from minference import MInference
+
+ # Set an environment variable
+ HF_TOKEN = os.environ.get("HF_TOKEN", None)
+
+ DESCRIPTION = '''
+ <div>
+ <h1 style="text-align: center;">Meta Llama3 8B</h1>
+ <p>This Space demonstrates the instruction-tuned model <a href="https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct"><b>Meta Llama3 8b Chat</b></a>. Meta Llama3 is the new open LLM and comes in two sizes: 8b and 70b. Feel free to play with it, or duplicate it to run privately!</p>
+ <p>🔎 For more details about the Llama3 release and how to use the model with <code>transformers</code>, take a look <a href="https://huggingface.co/blog/llama3">at our blog post</a>.</p>
+ <p>🦕 Looking for an even more powerful model? Check out the <a href="https://huggingface.co/chat/"><b>Hugging Chat</b></a> integration for Meta Llama 3 70b.</p>
+ </div>
+ '''
+
+ LICENSE = """
+ <p/>
+ ---
+ Built with Meta Llama 3
+ """
+
+ PLACEHOLDER = """
+ <div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
+    <img src="https://ysharma-dummy-chat-app.hf.space/file=/tmp/gradio/8e75e61cc9bab22b7ce3dec85ab0e6db1da5d107/Meta_lockup_positive%20primary_RGB.jpg" style="width: 80%; max-width: 550px; height: auto; opacity: 0.55;">
+    <h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">Meta llama3</h1>
+    <p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">Ask me anything...</p>
+ </div>
+ """
+
+ css = """
+ h1 {
+   text-align: center;
+   display: block;
+ }
+ #duplicate-button {
+   margin: auto;
+   color: white;
+   background: #1565c0;
+   border-radius: 100vh;
+ }
+ """
+
+ # Load the tokenizer and model
+ model_name = "gradientai/Llama-3-8B-Instruct-262k"
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
+
+ # Patch the model with MInference's dynamic sparse attention
+ minference_patch = MInference("minference", model_name)
+ model = minference_patch(model)
+
+ terminators = [
+     tokenizer.eos_token_id,
+     tokenizer.convert_tokens_to_ids("<|eot_id|>"),
+ ]
+
+ @spaces.GPU(duration=120)
+ def chat_llama3_8b(message: str, history: list, temperature: float, max_new_tokens: int):
+     """
+     Generate a streaming response using the llama3-8b model.
+     Args:
+         message (str): The input message.
+         history (list): The conversation history used by ChatInterface.
+         temperature (float): The temperature for generating the response.
+         max_new_tokens (int): The maximum number of new tokens to generate.
+     Yields:
+         str: The response generated so far.
+     """
+     conversation = []
+     for user, assistant in history:
+         conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
+     conversation.append({"role": "user", "content": message})
+
+     input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(model.device)
+
+     streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
+
+     generate_kwargs = dict(
+         input_ids=input_ids,
+         streamer=streamer,
+         max_new_tokens=max_new_tokens,
+         do_sample=True,
+         temperature=temperature,
+         eos_token_id=terminators,
+     )
+     # Force greedy decoding when temperature is 0; sampling with temperature=0 would crash.
+     if temperature == 0:
+         generate_kwargs['do_sample'] = False
+
+     # Run generation in a background thread and stream tokens as they arrive.
+     t = Thread(target=model.generate, kwargs=generate_kwargs)
+     t.start()
+
+     outputs = []
+     for text in streamer:
+         outputs.append(text)
+         yield "".join(outputs)
+
+
+ # Gradio block
+ chatbot = gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='Gradio ChatInterface')
+
+ with gr.Blocks(fill_height=True, css=css) as demo:
+     gr.Markdown(DESCRIPTION)
+     gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")
+     gr.ChatInterface(
+         fn=chat_llama3_8b,
+         chatbot=chatbot,
+         fill_height=True,
+         additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
+         additional_inputs=[
+             gr.Slider(minimum=0, maximum=1, step=0.1, value=0.95, label="Temperature", render=False),
+             gr.Slider(minimum=128, maximum=4096, step=1, value=512, label="Max new tokens", render=False),
+         ],
+         examples=[
+             ['How to setup a human base on Mars? Give short answer.'],
+             ['Explain theory of relativity to me like I’m 8 years old.'],
+             ['What is 9,000 * 9,000?'],
+             ['Write a pun-filled happy birthday message to my friend Alex.'],
+             ['Justify why a penguin might make a good king of the jungle.']
+         ],
+         cache_examples=False,
+     )
+
+     gr.Markdown(LICENSE)
+
+ if __name__ == "__main__":
+     demo.launch()
images/MInference1_onepage.png ADDED
images/MInference_logo.png ADDED
images/benchmarks/needle_viz_LLaMA-3-8B-1M_ours_1K_1000K.png ADDED
images/benchmarks/ppl-LLaMA-3-262k.png ADDED
minference/__init__.py ADDED
@@ -0,0 +1,27 @@
+ # Copyright (c) 2024 Microsoft
+ # Licensed under The MIT License [see LICENSE for details]
+ # flake8: noqa
+ from .minference_configuration import MInferenceConfig
+ from .models_patch import MInference
+ from .ops.block_sparse_flash_attention import block_sparse_attention
+ from .ops.pit_sparse_flash_attention_v2 import vertical_slash_sparse_attention
+ from .ops.streaming_kernel import streaming_forward
+ from .patch import (
+     minference_patch,
+     minference_patch_kv_cache_cpu,
+     minference_patch_with_snapkv,
+     patch_hf,
+ )
+ from .version import VERSION as __version__
+
+ __all__ = [
+     "MInference",
+     "MInferenceConfig",
+     "minference_patch",
+     "minference_patch_kv_cache_cpu",
+     "minference_patch_with_snapkv",
+     "patch_hf",
+     "vertical_slash_sparse_attention",
+     "block_sparse_attention",
+     "streaming_forward",
+ ]
minference/configs/Llama_3_8B_Instruct_262k_kv_out_v32_fit_o_best_pattern.json ADDED
@@ -0,0 +1 @@
+ [{"0": ["vertical_and_slash", 1000, 6096, 336], "1": ["vertical_and_slash", 1000, 6096, 26473], "2": ["vertical_and_slash", 1000, 6096, 0], "3": ["vertical_and_slash", 1000, 6096, 26958], "4": ["vertical_and_slash", 1000, 6096, 18905], "5": ["vertical_and_slash", 1000, 6096, 27990], "6": ["vertical_and_slash", 1000, 6096, 15162], "7": ["vertical_and_slash", 1000, 6096, 10529], "8": ["vertical_and_slash", 1000, 6096, 2], "9": ["vertical_and_slash", 1000, 6096, 828], "10": ["vertical_and_slash", 1000, 6096, 11405], "11": ["vertical_and_slash", 1000, 6096, 0], "12": ["vertical_and_slash", 1000, 6096, 55], "13": ["vertical_and_slash", 1000, 6096, 1], "14": ["vertical_and_slash", 1000, 6096, 0], "15": ["vertical_and_slash", 1000, 6096, 7021], "16": ["vertical_and_slash", 30, 800, 185169], "17": ["vertical_and_slash", 30, 800, 72929], "18": ["vertical_and_slash", 30, 800, 460008], "19": ["vertical_and_slash", 1000, 6096, 0], "20": ["vertical_and_slash", 1000, 6096, 71729], "21": ["vertical_and_slash", 1000, 6096, 52], "22": ["vertical_and_slash", 1000, 6096, 636], "23": ["vertical_and_slash", 1000, 6096, 75020], "24": ["vertical_and_slash", 1000, 6096, 23545], "25": ["vertical_and_slash", 1000, 6096, 90256], "26": ["vertical_and_slash", 1000, 6096, 45294], "27": ["vertical_and_slash", 1000, 6096, 32617], "28": ["vertical_and_slash", 3500, 100, 4777248], "29": ["vertical_and_slash", 3500, 100, 3996], "30": ["vertical_and_slash", 3500, 100, 590252], "31": ["vertical_and_slash", 3500, 100, 0]}, {"0": ["vertical_and_slash", 30, 800, 11048], "1": ["vertical_and_slash", 30, 800, 99768], "2": ["vertical_and_slash", 1000, 6096, 1393328], "3": ["vertical_and_slash", 30, 800, 97570], "4": ["vertical_and_slash", 30, 800, 9], "5": ["vertical_and_slash", 30, 800, 18], "6": ["vertical_and_slash", 30, 800, 216277], "7": ["vertical_and_slash", 30, 800, 148491], "8": ["vertical_and_slash", 100, 800, 543785], "9": ["vertical_and_slash", 1000, 6096, 2343829], "10": ["vertical_and_slash", 100, 800, 251542], "11": ["vertical_and_slash", 30, 800, 1064367], "12": ["vertical_and_slash", 1000, 6096, 6092], "13": ["vertical_and_slash", 30, 800, 12654], "14": ["vertical_and_slash", 1000, 6096, 0], "15": ["vertical_and_slash", 1000, 6096, 101], "16": ["vertical_and_slash", 30, 800, 21873], "17": ["vertical_and_slash", 30, 800, 107039], "18": ["vertical_and_slash", 30, 800, 9011], "19": ["vertical_and_slash", 30, 800, 445736], "20": ["vertical_and_slash", 30, 800, 1906], "21": ["vertical_and_slash", 30, 800, 3058], "22": ["vertical_and_slash", 1000, 6096, 430742], "23": ["vertical_and_slash", 1000, 6096, 181839], "24": ["vertical_and_slash", 30, 800, 125666], "25": ["vertical_and_slash", 30, 800, 704271], "26": ["vertical_and_slash", 30, 800, 14405], "27": ["vertical_and_slash", 30, 800, 70563], "28": ["vertical_and_slash", 1000, 6096, 38630], "29": ["vertical_and_slash", 1000, 6096, 68041], "30": ["vertical_and_slash", 30, 800, 6942], "31": ["vertical_and_slash", 1000, 6096, 35430]}, {"0": ["vertical_and_slash", 30, 800, 2720], "1": ["vertical_and_slash", 1000, 6096, 3045], "2": ["vertical_and_slash", 30, 800, 785], "3": ["vertical_and_slash", 1000, 6096, 14146], "4": ["vertical_and_slash", 100, 800, 315229], "5": ["vertical_and_slash", 1000, 6096, 195280], "6": ["vertical_and_slash", 1000, 6096, 1640055], "7": ["vertical_and_slash", 30, 800, 21026], "8": ["vertical_and_slash", 30, 800, 1082], "9": ["vertical_and_slash", 30, 800, 1851], "10": ["vertical_and_slash", 100, 800, 97766], "11": ["vertical_and_slash", 30, 800, 
14401], "12": ["vertical_and_slash", 100, 800, 55741], "13": ["vertical_and_slash", 30, 800, 100674], "14": ["vertical_and_slash", 100, 800, 5597503], "15": ["vertical_and_slash", 1000, 6096, 437796], "16": ["vertical_and_slash", 30, 800, 9647], "17": ["vertical_and_slash", 30, 800, 4590], "18": ["vertical_and_slash", 30, 800, 73], "19": ["vertical_and_slash", 1000, 6096, 823400], "20": ["vertical_and_slash", 1000, 6096, 464893], "21": ["vertical_and_slash", 1000, 6096, 406520], "22": ["vertical_and_slash", 1000, 6096, 49477], "23": ["vertical_and_slash", 30, 800, 25445], "24": ["vertical_and_slash", 30, 800, 172935], "25": ["vertical_and_slash", 30, 800, 125813], "26": ["vertical_and_slash", 30, 800, 35964], "27": ["vertical_and_slash", 30, 800, 64113], "28": ["vertical_and_slash", 30, 800, 8780], "29": ["vertical_and_slash", 30, 800, 7883], "30": ["vertical_and_slash", 30, 800, 3944], "31": ["vertical_and_slash", 30, 800, 1049]}, {"0": ["vertical_and_slash", 1000, 6096, 119045], "1": ["vertical_and_slash", 1000, 6096, 21633], "2": ["vertical_and_slash", 1000, 6096, 54], "3": ["vertical_and_slash", 1000, 6096, 756], "4": ["vertical_and_slash", 30, 800, 1524], "5": ["vertical_and_slash", 30, 800, 7576], "6": ["vertical_and_slash", 30, 800, 212024], "7": ["vertical_and_slash", 30, 800, 106253], "8": ["vertical_and_slash", 30, 800, 4801], "9": ["vertical_and_slash", 30, 800, 311445], "10": ["vertical_and_slash", 30, 800, 31540], "11": ["vertical_and_slash", 30, 800, 7706], "12": ["vertical_and_slash", 1000, 6096, 397], "13": ["vertical_and_slash", 1000, 6096, 40], "14": ["vertical_and_slash", 100, 800, 181], "15": ["vertical_and_slash", 1000, 6096, 15], "16": ["vertical_and_slash", 30, 800, 424080], "17": ["vertical_and_slash", 30, 800, 66114], "18": ["vertical_and_slash", 30, 800, 132526], "19": ["vertical_and_slash", 30, 800, 1478993], "20": ["vertical_and_slash", 1000, 6096, 655153], "21": ["vertical_and_slash", 1000, 6096, 117322], "22": ["vertical_and_slash", 1000, 6096, 572237], "23": ["vertical_and_slash", 1000, 6096, 688623], "24": ["vertical_and_slash", 1000, 6096, 294], "25": ["vertical_and_slash", 1000, 6096, 5035], "26": ["vertical_and_slash", 30, 800, 3874], "27": ["vertical_and_slash", 1000, 6096, 618117], "28": ["vertical_and_slash", 30, 800, 545357], "29": ["vertical_and_slash", 30, 800, 1746675], "30": ["vertical_and_slash", 30, 800, 612225], "31": ["vertical_and_slash", 100, 800, 232415]}, {"0": ["vertical_and_slash", 100, 800, 5379826], "1": ["vertical_and_slash", 100, 800, 4399425], "2": ["vertical_and_slash", 100, 800, 5842], "3": ["vertical_and_slash", 30, 800, 178263], "4": ["vertical_and_slash", 30, 800, 356], "5": ["vertical_and_slash", 30, 800, 2387916], "6": ["vertical_and_slash", 1000, 6096, 216595], "7": ["vertical_and_slash", 30, 800, 466], "8": ["vertical_and_slash", 1000, 6096, 832044], "9": ["vertical_and_slash", 1000, 6096, 59709], "10": ["vertical_and_slash", 1000, 6096, 1194089], "11": ["vertical_and_slash", 1000, 6096, 356408], "12": ["vertical_and_slash", 30, 800, 30528], "13": ["vertical_and_slash", 30, 800, 22217], "14": ["vertical_and_slash", 30, 800, 9162], "15": ["vertical_and_slash", 100, 800, 1641325], "16": ["vertical_and_slash", 1000, 6096, 489936], "17": ["vertical_and_slash", 30, 800, 58107], "18": ["vertical_and_slash", 1000, 6096, 8539], "19": ["vertical_and_slash", 1000, 6096, 508038], "20": ["vertical_and_slash", 100, 800, 2632857], "21": ["vertical_and_slash", 1000, 6096, 79517], "22": ["vertical_and_slash", 30, 800, 330362], "23": 
["vertical_and_slash", 1000, 6096, 85961], "24": ["vertical_and_slash", 30, 800, 23942], "25": ["vertical_and_slash", 30, 800, 75337], "26": ["vertical_and_slash", 30, 800, 3544417], "27": ["vertical_and_slash", 30, 800, 146427], "28": ["vertical_and_slash", 1000, 6096, 10561], "29": ["vertical_and_slash", 100, 800, 8759352], "30": ["vertical_and_slash", 100, 800, 8425], "31": ["vertical_and_slash", 30, 800, 22]}, {"0": ["vertical_and_slash", 30, 800, 50473], "1": ["vertical_and_slash", 1000, 6096, 277369], "2": ["vertical_and_slash", 30, 800, 59349], "3": ["vertical_and_slash", 30, 800, 27256], "4": ["vertical_and_slash", 30, 800, 112822], "5": ["vertical_and_slash", 1000, 6096, 346887], "6": ["vertical_and_slash", 1000, 6096, 84774], "7": ["vertical_and_slash", 1000, 6096, 954773], "8": ["vertical_and_slash", 1000, 6096, 1210908], "9": ["vertical_and_slash", 1000, 6096, 1679398], "10": ["vertical_and_slash", 1000, 6096, 2474351], "11": ["vertical_and_slash", 1000, 6096, 80495], "12": ["vertical_and_slash", 30, 800, 56761], "13": ["vertical_and_slash", 30, 800, 27757], "14": ["vertical_and_slash", 30, 800, 8811], "15": ["vertical_and_slash", 30, 800, 31547], "16": ["vertical_and_slash", 100, 800, 93167], "17": ["vertical_and_slash", 1000, 6096, 1464896], "18": ["vertical_and_slash", 1000, 6096, 434459], "19": ["vertical_and_slash", 30, 800, 1654521], "20": ["vertical_and_slash", 1000, 6096, 414], "21": ["vertical_and_slash", 1000, 6096, 76207], "22": ["vertical_and_slash", 1000, 6096, 8583], "23": ["vertical_and_slash", 1000, 6096, 1471], "24": ["vertical_and_slash", 1000, 6096, 231656], "25": ["vertical_and_slash", 500, 700, 95889], "26": ["vertical_and_slash", 30, 800, 62035], "27": ["vertical_and_slash", 1000, 6096, 43859], "28": ["vertical_and_slash", 30, 800, 23458], "29": ["vertical_and_slash", 30, 800, 53092], "30": ["vertical_and_slash", 30, 800, 74240], "31": ["vertical_and_slash", 30, 800, 45214]}, {"0": ["vertical_and_slash", 30, 800, 507], "1": ["vertical_and_slash", 100, 800, 8490], "2": ["vertical_and_slash", 100, 800, 3952118], "3": ["vertical_and_slash", 100, 800, 2475164], "4": ["vertical_and_slash", 100, 800, 8038], "5": ["vertical_and_slash", 30, 800, 2620494], "6": ["vertical_and_slash", 1000, 6096, 57306], "7": ["vertical_and_slash", 30, 800, 18889], "8": ["vertical_and_slash", 30, 800, 14900], "9": ["vertical_and_slash", 30, 800, 310453], "10": ["vertical_and_slash", 30, 800, 5494], "11": ["vertical_and_slash", 30, 800, 16096], "12": ["vertical_and_slash", 30, 800, 45897], "13": ["vertical_and_slash", 30, 800, 120295], "14": ["vertical_and_slash", 30, 800, 1446587], "15": ["vertical_and_slash", 30, 800, 133562], "16": ["vertical_and_slash", 30, 800, 81561], "17": ["vertical_and_slash", 100, 800, 1091558], "18": ["vertical_and_slash", 30, 800, 1104027], "19": ["vertical_and_slash", 30, 800, 95228], "20": ["vertical_and_slash", 1000, 6096, 81766], "21": ["vertical_and_slash", 1000, 6096, 1604474], "22": ["vertical_and_slash", 30, 800, 1720847], "23": ["vertical_and_slash", 30, 800, 254367], "24": ["vertical_and_slash", 1000, 6096, 69837], "25": ["vertical_and_slash", 1000, 6096, 1346498], "26": ["vertical_and_slash", 1000, 6096, 251707], "27": ["vertical_and_slash", 1000, 6096, 21055], "28": ["vertical_and_slash", 100, 800, 1310349], "29": ["vertical_and_slash", 1000, 6096, 523], "30": ["vertical_and_slash", 100, 800, 5], "31": ["vertical_and_slash", 1000, 6096, 4114]}, {"0": ["vertical_and_slash", 30, 800, 2076100], "1": ["vertical_and_slash", 30, 800, 742482], "2": 
["vertical_and_slash", 30, 800, 84396], "3": ["vertical_and_slash", 100, 800, 6621015], "4": ["vertical_and_slash", 30, 800, 269671], "5": ["vertical_and_slash", 30, 800, 142041], "6": ["vertical_and_slash", 1000, 6096, 2493869], "7": ["vertical_and_slash", 1000, 6096, 2460341], "8": ["vertical_and_slash", 30, 800, 352690], "9": ["vertical_and_slash", 30, 800, 134441], "10": ["vertical_and_slash", 1000, 6096, 112278], "11": ["vertical_and_slash", 30, 800, 62933], "12": ["vertical_and_slash", 30, 800, 150459], "13": ["vertical_and_slash", 1000, 6096, 120036], "14": ["vertical_and_slash", 100, 800, 433238], "15": ["vertical_and_slash", 100, 800, 2723047], "16": ["vertical_and_slash", 1000, 6096, 112925], "17": ["vertical_and_slash", 1000, 6096, 23380], "18": ["vertical_and_slash", 1000, 6096, 92620], "19": ["vertical_and_slash", 1000, 6096, 37993], "20": ["vertical_and_slash", 100, 800, 74928], "21": ["vertical_and_slash", 3500, 100, 14191655], "22": ["vertical_and_slash", 1000, 6096, 514675], "23": ["vertical_and_slash", 100, 800, 9577073], "24": ["vertical_and_slash", 100, 800, 531136], "25": ["vertical_and_slash", 1000, 6096, 30007], "26": ["vertical_and_slash", 1000, 6096, 170687], "27": ["vertical_and_slash", 30, 800, 540287], "28": ["vertical_and_slash", 30, 800, 1435852], "29": ["vertical_and_slash", 30, 800, 948060], "30": ["vertical_and_slash", 1000, 6096, 37219], "31": ["vertical_and_slash", 1000, 6096, 211641]}, {"0": ["vertical_and_slash", 1000, 6096, 582795], "1": ["vertical_and_slash", 1000, 6096, 6289238], "2": ["vertical_and_slash", 1000, 6096, 570805], "3": ["vertical_and_slash", 1000, 6096, 198493], "4": ["vertical_and_slash", 30, 800, 112215], "5": ["vertical_and_slash", 30, 800, 5387246], "6": ["vertical_and_slash", 30, 800, 754350], "7": ["vertical_and_slash", 1000, 6096, 164737], "8": ["vertical_and_slash", 1000, 6096, 8597099], "9": ["vertical_and_slash", 1000, 6096, 13891466], "10": ["vertical_and_slash", 100, 800, 12184646], "11": ["vertical_and_slash", 1000, 6096, 3397834], "12": ["vertical_and_slash", 1000, 6096, 274297], "13": ["vertical_and_slash", 30, 800, 505818], "14": ["vertical_and_slash", 1000, 6096, 382749], "15": ["vertical_and_slash", 1000, 6096, 53485], "16": ["vertical_and_slash", 1000, 6096, 63748], "17": ["vertical_and_slash", 1000, 6096, 743437], "18": ["vertical_and_slash", 1000, 6096, 884226], "19": ["vertical_and_slash", 1000, 6096, 32754], "20": ["vertical_and_slash", 30, 800, 154807], "21": ["vertical_and_slash", 30, 800, 515833], "22": ["vertical_and_slash", 30, 800, 379827], "23": ["vertical_and_slash", 30, 800, 5140670], "24": ["vertical_and_slash", 1000, 6096, 8857], "25": ["vertical_and_slash", 1000, 6096, 9739], "26": ["vertical_and_slash", 1000, 6096, 3362559], "27": ["vertical_and_slash", 1000, 6096, 3602170], "28": ["vertical_and_slash", 1000, 6096, 286758], "29": ["vertical_and_slash", 1000, 6096, 1091568], "30": ["vertical_and_slash", 1000, 6096, 464410], "31": ["vertical_and_slash", 1000, 6096, 9113238]}, {"0": ["vertical_and_slash", 1000, 6096, 4112309], "1": ["vertical_and_slash", 1000, 6096, 6237157], "2": ["vertical_and_slash", 1000, 6096, 12411496], "3": ["vertical_and_slash", 1000, 6096, 3333545], "4": ["vertical_and_slash", 1000, 6096, 1082199], "5": ["vertical_and_slash", 1000, 6096, 3624535], "6": ["vertical_and_slash", 1000, 6096, 85587], "7": ["vertical_and_slash", 1000, 6096, 5060732], "8": ["vertical_and_slash", 30, 800, 981020], "9": ["vertical_and_slash", 30, 800, 647089], "10": ["vertical_and_slash", 30, 800, 
1168497], "11": ["vertical_and_slash", 30, 800, 241811], "12": ["vertical_and_slash", 1000, 6096, 14258787], "13": ["vertical_and_slash", 1000, 6096, 13881708], "14": ["vertical_and_slash", 100, 800, 9807781], "15": ["vertical_and_slash", 1000, 6096, 11824390], "16": ["vertical_and_slash", 1000, 6096, 382173], "17": ["vertical_and_slash", 1000, 6096, 682553], "18": ["vertical_and_slash", 1000, 6096, 228115], "19": ["vertical_and_slash", 1000, 6096, 730935], "20": ["vertical_and_slash", 1000, 6096, 10237660], "21": ["vertical_and_slash", 1000, 6096, 210229], "22": ["vertical_and_slash", 1000, 6096, 4883397], "23": ["vertical_and_slash", 1000, 6096, 569329], "24": ["vertical_and_slash", 100, 800, 4152], "25": ["vertical_and_slash", 1000, 6096, 235235], "26": ["vertical_and_slash", 100, 800, 22473], "27": ["vertical_and_slash", 3500, 100, 14276508], "28": ["vertical_and_slash", 1000, 6096, 2277550], "29": ["vertical_and_slash", 1000, 6096, 1821096], "30": ["vertical_and_slash", 30, 800, 1212061], "31": ["vertical_and_slash", 1000, 6096, 13192107]}, {"0": ["vertical_and_slash", 1000, 6096, 812453], "1": ["vertical_and_slash", 1000, 6096, 6634405], "2": ["vertical_and_slash", 1000, 6096, 6896128], "3": ["vertical_and_slash", 1000, 6096, 12539813], "4": ["vertical_and_slash", 1000, 6096, 90867], "5": ["vertical_and_slash", 1000, 6096, 592412], "6": ["vertical_and_slash", 1000, 6096, 1863965], "7": ["vertical_and_slash", 1000, 6096, 1412714], "8": ["vertical_and_slash", 100, 800, 4723238], "9": ["vertical_and_slash", 30, 800, 73268], "10": ["vertical_and_slash", 1000, 6096, 522198], "11": ["vertical_and_slash", 30, 800, 144456], "12": ["vertical_and_slash", 1000, 6096, 218571], "13": ["vertical_and_slash", 1000, 6096, 4766244], "14": ["vertical_and_slash", 1000, 6096, 519409], "15": ["vertical_and_slash", 100, 800, 257427], "16": ["vertical_and_slash", 30, 800, 913307], "17": ["vertical_and_slash", 1000, 6096, 272105], "18": ["vertical_and_slash", 1000, 6096, 10253560], "19": ["vertical_and_slash", 1000, 6096, 103219], "20": ["vertical_and_slash", 1000, 6096, 825917], "21": ["vertical_and_slash", 1000, 6096, 1573906], "22": ["vertical_and_slash", 1000, 6096, 1401963], "23": ["vertical_and_slash", 1000, 6096, 903562], "24": ["vertical_and_slash", 1000, 6096, 116448], "25": ["vertical_and_slash", 500, 700, 10497021], "26": ["vertical_and_slash", 1000, 6096, 1451038], "27": ["vertical_and_slash", 100, 800, 9129837], "28": ["vertical_and_slash", 1000, 6096, 6069558], "29": ["vertical_and_slash", 100, 800, 4906900], "30": ["vertical_and_slash", 100, 800, 1935350], "31": ["vertical_and_slash", 1000, 6096, 13438131]}, {"0": ["vertical_and_slash", 1000, 6096, 200475], "1": ["vertical_and_slash", 1000, 6096, 2525357], "2": ["vertical_and_slash", 1000, 6096, 1581552], "3": ["vertical_and_slash", 1000, 6096, 1585962], "4": ["vertical_and_slash", 100, 800, 2468769], "5": ["vertical_and_slash", 1000, 6096, 2284149], "6": ["vertical_and_slash", 1000, 6096, 3954975], "7": ["vertical_and_slash", 1000, 6096, 12242517], "8": ["vertical_and_slash", 1000, 6096, 407981], "9": ["vertical_and_slash", 1000, 6096, 387918], "10": ["vertical_and_slash", 30, 800, 494970], "11": ["vertical_and_slash", 1000, 6096, 237593], "12": ["vertical_and_slash", 1000, 6096, 13227100], "13": ["vertical_and_slash", 1000, 6096, 7150283], "14": ["vertical_and_slash", 1000, 6096, 1460829], "15": ["vertical_and_slash", 1000, 6096, 5830515], "16": ["vertical_and_slash", 30, 800, 321990], "17": ["vertical_and_slash", 500, 700, 412885], "18": 
["vertical_and_slash", 30, 800, 7754087], "19": ["vertical_and_slash", 30, 800, 593222], "20": ["vertical_and_slash", 1000, 6096, 9430066], "21": ["vertical_and_slash", 1000, 6096, 11445545], "22": ["vertical_and_slash", 1000, 6096, 10096832], "23": ["vertical_and_slash", 1000, 6096, 11108827], "24": ["vertical_and_slash", 1000, 6096, 2040566], "25": ["vertical_and_slash", 1000, 6096, 1293645], "26": ["vertical_and_slash", 1000, 6096, 1681146], "27": ["vertical_and_slash", 1000, 6096, 1621078], "28": ["vertical_and_slash", 3500, 100, 14482863], "29": ["vertical_and_slash", 3500, 100, 14306340], "30": ["vertical_and_slash", 3500, 100, 14736032], "31": ["vertical_and_slash", 30, 800, 59474]}, {"0": ["vertical_and_slash", 30, 800, 2015977], "1": ["vertical_and_slash", 1000, 6096, 1851908], "2": ["vertical_and_slash", 500, 700, 3019045], "3": ["vertical_and_slash", 30, 800, 2275137], "4": ["vertical_and_slash", 1000, 6096, 111007], "5": ["vertical_and_slash", 1000, 6096, 74876], "6": ["vertical_and_slash", 1000, 6096, 291657], "7": ["vertical_and_slash", 1000, 6096, 72059], "8": ["vertical_and_slash", 100, 800, 4966732], "9": ["vertical_and_slash", 30, 800, 1227926], "10": ["vertical_and_slash", 1000, 6096, 817635], "11": ["vertical_and_slash", 100, 800, 1996081], "12": ["vertical_and_slash", 30, 800, 320794], "13": ["vertical_and_slash", 30, 800, 641018], "14": ["vertical_and_slash", 1000, 6096, 784584], "15": ["vertical_and_slash", 500, 700, 615730], "16": ["vertical_and_slash", 30, 800, 130637], "17": ["vertical_and_slash", 500, 700, 237719], "18": ["vertical_and_slash", 30, 800, 484009], "19": ["vertical_and_slash", 30, 800, 71667], "20": ["vertical_and_slash", 30, 800, 6034932], "21": ["vertical_and_slash", 30, 800, 279606], "22": ["vertical_and_slash", 30, 800, 273046], "23": ["vertical_and_slash", 500, 700, 5343396], "24": ["vertical_and_slash", 30, 800, 424419], "25": ["vertical_and_slash", 30, 800, 268585], "26": ["vertical_and_slash", 500, 700, 469509], "27": ["vertical_and_slash", 30, 800, 1150183], "28": ["vertical_and_slash", 30, 800, 567665], "29": ["vertical_and_slash", 30, 800, 689969], "30": ["vertical_and_slash", 30, 800, 3124447], "31": ["vertical_and_slash", 500, 700, 1311816]}, {"0": ["vertical_and_slash", 1000, 6096, 13054849], "1": ["vertical_and_slash", 1000, 6096, 11676492], "2": ["vertical_and_slash", 1000, 6096, 13662962], "3": ["vertical_and_slash", 1000, 6096, 13009510], "4": ["vertical_and_slash", 1000, 6096, 13228770], "5": ["vertical_and_slash", 1000, 6096, 13738897], "6": ["vertical_and_slash", 1000, 6096, 4327684], "7": ["vertical_and_slash", 100, 800, 1780647], "8": ["vertical_and_slash", 1000, 6096, 12984525], "9": ["vertical_and_slash", 1000, 6096, 10106452], "10": ["vertical_and_slash", 1000, 6096, 13121645], "11": ["vertical_and_slash", 1000, 6096, 7143877], "12": ["vertical_and_slash", 1000, 6096, 1302273], "13": ["vertical_and_slash", 1000, 6096, 12189960], "14": ["vertical_and_slash", 1000, 6096, 10369892], "15": ["vertical_and_slash", 1000, 6096, 6251432], "16": ["vertical_and_slash", 1000, 6096, 13767358], "17": ["vertical_and_slash", 1000, 6096, 14264179], "18": ["vertical_and_slash", 1000, 6096, 14027354], "19": ["vertical_and_slash", 1000, 6096, 12810299], "20": ["vertical_and_slash", 1000, 6096, 11500719], "21": ["vertical_and_slash", 1000, 6096, 8729013], "22": ["vertical_and_slash", 100, 800, 1386474], "23": ["vertical_and_slash", 1000, 6096, 8809015], "24": ["vertical_and_slash", 30, 800, 1192385], "25": ["vertical_and_slash", 100, 800, 
6597145], "26": ["vertical_and_slash", 100, 800, 11801029], "27": ["vertical_and_slash", 1000, 6096, 981847], "28": ["vertical_and_slash", 1000, 6096, 3790181], "29": ["vertical_and_slash", 30, 800, 1641474], "30": ["vertical_and_slash", 1000, 6096, 4214917], "31": ["vertical_and_slash", 1000, 6096, 3423871]}, {"0": ["vertical_and_slash", 1000, 6096, 7281028], "1": ["vertical_and_slash", 1000, 6096, 6327889], "2": ["vertical_and_slash", 1000, 6096, 5161807], "3": ["vertical_and_slash", 1000, 6096, 6944365], "4": ["vertical_and_slash", 1000, 6096, 10798408], "5": ["vertical_and_slash", 1000, 6096, 11848526], "6": ["vertical_and_slash", 1000, 6096, 5023703], "7": ["vertical_and_slash", 1000, 6096, 6869756], "8": ["vertical_and_slash", 30, 800, 2070673], "9": ["vertical_and_slash", 30, 800, 2108039], "10": ["vertical_and_slash", 30, 800, 2478923], "11": ["vertical_and_slash", 30, 800, 1062019], "12": ["vertical_and_slash", 1000, 6096, 10483422], "13": ["vertical_and_slash", 1000, 6096, 13220734], "14": ["vertical_and_slash", 1000, 6096, 10864461], "15": ["vertical_and_slash", 1000, 6096, 10380263], "16": ["vertical_and_slash", 1000, 6096, 12606664], "17": ["vertical_and_slash", 1000, 6096, 12755695], "18": ["vertical_and_slash", 1000, 6096, 14481440], "19": ["vertical_and_slash", 1000, 6096, 12125755], "20": ["vertical_and_slash", 1000, 6096, 13727938], "21": ["vertical_and_slash", 100, 800, 9986525], "22": ["vertical_and_slash", 1000, 6096, 13802294], "23": ["vertical_and_slash", 1000, 6096, 8589854], "24": ["vertical_and_slash", 1000, 6096, 8696624], "25": ["vertical_and_slash", 1000, 6096, 6711141], "26": ["vertical_and_slash", 30, 800, 11407], "27": ["vertical_and_slash", 1000, 6096, 10286733], "28": ["vertical_and_slash", 100, 800, 14346519], "29": ["vertical_and_slash", 3500, 100, 14822370], "30": ["vertical_and_slash", 1000, 6096, 13996996], "31": ["vertical_and_slash", 3500, 100, 13837843]}, {"0": ["vertical_and_slash", 30, 800, 187826], "1": ["vertical_and_slash", 1000, 6096, 319682], "2": ["vertical_and_slash", 1000, 6096, 717971], "3": ["vertical_and_slash", 1000, 6096, 12248225], "4": ["vertical_and_slash", 30, 800, 2311494], "5": ["vertical_and_slash", 1000, 6096, 354949], "6": ["vertical_and_slash", 30, 800, 2723442], "7": ["vertical_and_slash", 30, 800, 217627], "8": ["vertical_and_slash", 500, 700, 1800505], "9": ["vertical_and_slash", 30, 800, 5395314], "10": ["vertical_and_slash", 30, 800, 10715415], "11": ["vertical_and_slash", 100, 800, 13267898], "12": ["vertical_and_slash", 30, 800, 282819], "13": ["vertical_and_slash", 1000, 6096, 8417130], "14": ["vertical_and_slash", 1000, 6096, 5380564], "15": ["vertical_and_slash", 1000, 6096, 9802765], "16": ["vertical_and_slash", 1000, 6096, 385044], "17": ["vertical_and_slash", 1000, 6096, 2048601], "18": ["vertical_and_slash", 1000, 6096, 2798283], "19": ["vertical_and_slash", 100, 800, 11985153], "20": ["vertical_and_slash", 1000, 6096, 9560488], "21": ["vertical_and_slash", 1000, 6096, 8719957], "22": ["vertical_and_slash", 1000, 6096, 10883722], "23": ["vertical_and_slash", 1000, 6096, 11184293], "24": ["vertical_and_slash", 1000, 6096, 5049287], "25": ["vertical_and_slash", 1000, 6096, 6119952], "26": ["vertical_and_slash", 1000, 6096, 11948638], "27": ["vertical_and_slash", 1000, 6096, 4654529], "28": ["vertical_and_slash", 1000, 6096, 269543], "29": ["vertical_and_slash", 1000, 6096, 1183543], "30": ["vertical_and_slash", 1000, 6096, 4018748], "31": ["vertical_and_slash", 30, 800, 208750]}, {"0": ["vertical_and_slash", 
3500, 100, 14712977], "1": ["vertical_and_slash", 1000, 6096, 7977346], "2": ["vertical_and_slash", 100, 800, 12022826], "3": ["vertical_and_slash", 100, 800, 7525648], "4": ["vertical_and_slash", 500, 700, 627445], "5": ["vertical_and_slash", 1000, 6096, 1067661], "6": ["vertical_and_slash", 500, 700, 199111], "7": ["vertical_and_slash", 100, 800, 1462908], "8": ["vertical_and_slash", 1000, 6096, 12608289], "9": ["vertical_and_slash", 1000, 6096, 3815760], "10": ["vertical_and_slash", 100, 800, 5050623], "11": ["vertical_and_slash", 3500, 100, 6790875], "12": ["vertical_and_slash", 30, 800, 284918], "13": ["vertical_and_slash", 500, 700, 277887], "14": ["vertical_and_slash", 500, 700, 236664], "15": ["vertical_and_slash", 30, 800, 3582148], "16": ["vertical_and_slash", 100, 800, 13373963], "17": ["vertical_and_slash", 100, 800, 682950], "18": ["vertical_and_slash", 1000, 6096, 7136486], "19": ["vertical_and_slash", 1000, 6096, 13769505], "20": ["vertical_and_slash", 1000, 6096, 9883913], "21": ["vertical_and_slash", 1000, 6096, 10833503], "22": ["vertical_and_slash", 30, 800, 62940], "23": ["vertical_and_slash", 1000, 6096, 4652762], "24": ["vertical_and_slash", 1000, 6096, 5480379], "25": ["vertical_and_slash", 3500, 100, 14131887], "26": ["vertical_and_slash", 100, 800, 9221283], "27": ["vertical_and_slash", 1000, 6096, 4197162], "28": ["vertical_and_slash", 30, 800, 4438611], "29": ["vertical_and_slash", 30, 800, 354648], "30": ["vertical_and_slash", 30, 800, 7285775], "31": ["vertical_and_slash", 30, 800, 4392079]}, {"0": ["vertical_and_slash", 1000, 6096, 2131686], "1": ["vertical_and_slash", 1000, 6096, 3609919], "2": ["vertical_and_slash", 1000, 6096, 899481], "3": ["vertical_and_slash", 100, 800, 3219776], "4": ["vertical_and_slash", 3500, 100, 11460535], "5": ["vertical_and_slash", 1000, 6096, 154336], "6": ["vertical_and_slash", 3500, 100, 14438950], "7": ["vertical_and_slash", 100, 800, 6652113], "8": ["vertical_and_slash", 100, 800, 9133667], "9": ["vertical_and_slash", 100, 800, 8048731], "10": ["vertical_and_slash", 1000, 6096, 528931], "11": ["vertical_and_slash", 30, 800, 2635938], "12": ["vertical_and_slash", 30, 800, 8546455], "13": ["vertical_and_slash", 500, 700, 7229697], "14": ["vertical_and_slash", 1000, 6096, 32195], "15": ["vertical_and_slash", 1000, 6096, 230534], "16": ["vertical_and_slash", 100, 800, 2475909], "17": ["vertical_and_slash", 30, 800, 2484470], "18": ["vertical_and_slash", 100, 800, 8168145], "19": ["vertical_and_slash", 3500, 100, 6348588], "20": ["vertical_and_slash", 500, 700, 290337], "21": ["vertical_and_slash", 3500, 100, 12830116], "22": ["vertical_and_slash", 100, 800, 11406972], "23": ["vertical_and_slash", 1000, 6096, 9663426], "24": ["vertical_and_slash", 3500, 100, 14333500], "25": ["vertical_and_slash", 3500, 100, 14787732], "26": ["vertical_and_slash", 1000, 6096, 13209856], "27": ["vertical_and_slash", 100, 800, 14623240], "28": ["vertical_and_slash", 1000, 6096, 6321698], "29": ["vertical_and_slash", 1000, 6096, 10324255], "30": ["vertical_and_slash", 100, 800, 1338], "31": ["vertical_and_slash", 1000, 6096, 5182275]}, {"0": ["vertical_and_slash", 100, 800, 2653574], "1": ["vertical_and_slash", 1000, 6096, 156404], "2": ["vertical_and_slash", 1000, 6096, 3288754], "3": ["vertical_and_slash", 1000, 6096, 597358], "4": ["vertical_and_slash", 1000, 6096, 13162000], "5": ["vertical_and_slash", 100, 800, 3304599], "6": ["vertical_and_slash", 100, 800, 2334228], "7": ["vertical_and_slash", 30, 800, 151547], "8": ["vertical_and_slash", 
1000, 6096, 8084555], "9": ["vertical_and_slash", 1000, 6096, 6986695], "10": ["vertical_and_slash", 30, 800, 1349542], "11": ["vertical_and_slash", 1000, 6096, 62139], "12": ["vertical_and_slash", 500, 700, 586215], "13": ["vertical_and_slash", 30, 800, 3339401], "14": ["vertical_and_slash", 500, 700, 9080591], "15": ["vertical_and_slash", 100, 800, 1860621], "16": ["vertical_and_slash", 1000, 6096, 11577402], "17": ["vertical_and_slash", 1000, 6096, 6483036], "18": ["vertical_and_slash", 1000, 6096, 10223119], "19": ["vertical_and_slash", 1000, 6096, 2516899], "20": ["vertical_and_slash", 100, 800, 14689692], "21": ["vertical_and_slash", 1000, 6096, 9574317], "22": ["vertical_and_slash", 1000, 6096, 14315469], "23": ["vertical_and_slash", 1000, 6096, 11084722], "24": ["vertical_and_slash", 30, 800, 5714332], "25": ["vertical_and_slash", 30, 800, 440501], "26": ["vertical_and_slash", 30, 800, 135011], "27": ["vertical_and_slash", 100, 800, 1143711], "28": ["vertical_and_slash", 1000, 6096, 10833817], "29": ["vertical_and_slash", 100, 800, 9389405], "30": ["vertical_and_slash", 1000, 6096, 7182171], "31": ["vertical_and_slash", 1000, 6096, 3116752]}, {"0": ["vertical_and_slash", 1000, 6096, 2272762], "1": ["vertical_and_slash", 100, 800, 9251901], "2": ["vertical_and_slash", 1000, 6096, 3172792], "3": ["vertical_and_slash", 1000, 6096, 11166637], "4": ["vertical_and_slash", 1000, 6096, 267179], "5": ["vertical_and_slash", 100, 800, 1956945], "6": ["vertical_and_slash", 1000, 6096, 431457], "7": ["vertical_and_slash", 100, 800, 215074], "8": ["vertical_and_slash", 30, 800, 160167], "9": ["vertical_and_slash", 1000, 6096, 13251530], "10": ["vertical_and_slash", 100, 800, 1045212], "11": ["vertical_and_slash", 1000, 6096, 7767754], "12": ["vertical_and_slash", 100, 800, 8430862], "13": ["vertical_and_slash", 100, 800, 12275346], "14": ["vertical_and_slash", 1000, 6096, 12967454], "15": ["vertical_and_slash", 1000, 6096, 776792], "16": ["vertical_and_slash", 30, 800, 4940981], "17": ["vertical_and_slash", 1000, 6096, 4687476], "18": ["vertical_and_slash", 30, 800, 3396568], "19": ["vertical_and_slash", 1000, 6096, 6330177], "20": ["vertical_and_slash", 100, 800, 10772100], "21": ["vertical_and_slash", 1000, 6096, 431927], "22": ["vertical_and_slash", 100, 800, 5368777], "23": ["vertical_and_slash", 100, 800, 11971880], "24": ["vertical_and_slash", 1000, 6096, 3355141], "25": ["vertical_and_slash", 30, 800, 7775685], "26": ["vertical_and_slash", 1000, 6096, 17862], "27": ["vertical_and_slash", 1000, 6096, 2368170], "28": ["vertical_and_slash", 1000, 6096, 887652], "29": ["vertical_and_slash", 1000, 6096, 342019], "30": ["vertical_and_slash", 1000, 6096, 2031], "31": ["vertical_and_slash", 100, 800, 851845]}, {"0": ["vertical_and_slash", 1000, 6096, 9577296], "1": ["vertical_and_slash", 1000, 6096, 6130994], "2": ["vertical_and_slash", 1000, 6096, 932158], "3": ["vertical_and_slash", 1000, 6096, 6193523], "4": ["vertical_and_slash", 30, 800, 4212495], "5": ["vertical_and_slash", 1000, 6096, 82539], "6": ["vertical_and_slash", 1000, 6096, 2033854], "7": ["vertical_and_slash", 100, 800, 973812], "8": ["vertical_and_slash", 1000, 6096, 96691], "9": ["vertical_and_slash", 1000, 6096, 7346123], "10": ["vertical_and_slash", 1000, 6096, 3425225], "11": ["vertical_and_slash", 1000, 6096, 5656378], "12": ["vertical_and_slash", 1000, 6096, 13585373], "13": ["vertical_and_slash", 3500, 100, 12228455], "14": ["vertical_and_slash", 100, 800, 14994473], "15": ["vertical_and_slash", 1000, 6096, 12825284], 
"16": ["vertical_and_slash", 1000, 6096, 8256], "17": ["vertical_and_slash", 1000, 6096, 287798], "18": ["vertical_and_slash", 1000, 6096, 3485339], "19": ["vertical_and_slash", 1000, 6096, 4049013], "20": ["vertical_and_slash", 1000, 6096, 10172329], "21": ["vertical_and_slash", 100, 800, 70376], "22": ["vertical_and_slash", 500, 700, 624964], "23": ["vertical_and_slash", 1000, 6096, 7478718], "24": ["vertical_and_slash", 1000, 6096, 11234418], "25": ["vertical_and_slash", 100, 800, 12774404], "26": ["vertical_and_slash", 1000, 6096, 10820183], "27": ["vertical_and_slash", 1000, 6096, 8669939], "28": ["vertical_and_slash", 100, 800, 46], "29": ["vertical_and_slash", 30, 800, 2478], "30": ["vertical_and_slash", 1000, 6096, 343890], "31": ["vertical_and_slash", 1000, 6096, 485618]}, {"0": ["vertical_and_slash", 1000, 6096, 2552], "1": ["vertical_and_slash", 1000, 6096, 3940587], "2": ["vertical_and_slash", 1000, 6096, 2070936], "3": ["vertical_and_slash", 1000, 6096, 232875], "4": ["vertical_and_slash", 30, 800, 751140], "5": ["vertical_and_slash", 100, 800, 231769], "6": ["vertical_and_slash", 30, 800, 2274515], "7": ["vertical_and_slash", 30, 800, 989564], "8": ["vertical_and_slash", 3500, 100, 14768346], "9": ["vertical_and_slash", 30, 800, 1208594], "10": ["vertical_and_slash", 30, 800, 1770328], "11": ["vertical_and_slash", 1000, 6096, 8752930], "12": ["vertical_and_slash", 3500, 100, 46312], "13": ["vertical_and_slash", 100, 800, 289542], "14": ["vertical_and_slash", 3500, 100, 306397], "15": ["vertical_and_slash", 3500, 100, 56350], "16": ["vertical_and_slash", 100, 800, 356204], "17": ["vertical_and_slash", 3500, 100, 1500240], "18": ["vertical_and_slash", 1000, 6096, 150152], "19": ["vertical_and_slash", 100, 800, 101799], "20": ["vertical_and_slash", 1000, 6096, 299393], "21": ["vertical_and_slash", 1000, 6096, 8627429], "22": ["vertical_and_slash", 1000, 6096, 3529325], "23": ["vertical_and_slash", 1000, 6096, 1448873], "24": ["vertical_and_slash", 1000, 6096, 1712901], "25": ["vertical_and_slash", 500, 700, 4048433], "26": ["vertical_and_slash", 1000, 6096, 3837844], "27": ["vertical_and_slash", 1000, 6096, 5399791], "28": ["vertical_and_slash", 1000, 6096, 5525857], "29": ["vertical_and_slash", 1000, 6096, 4847570], "30": ["vertical_and_slash", 1000, 6096, 7521944], "31": ["vertical_and_slash", 1000, 6096, 6944849]}, {"0": ["vertical_and_slash", 3500, 100, 12061195], "1": ["vertical_and_slash", 3500, 100, 13821114], "2": ["vertical_and_slash", 1000, 6096, 11831232], "3": ["vertical_and_slash", 1000, 6096, 1990608], "4": ["vertical_and_slash", 1000, 6096, 1126789], "5": ["vertical_and_slash", 1000, 6096, 164058], "6": ["vertical_and_slash", 1000, 6096, 1546250], "7": ["vertical_and_slash", 3500, 100, 3197616], "8": ["vertical_and_slash", 1000, 6096, 4347461], "9": ["vertical_and_slash", 100, 800, 6182587], "10": ["vertical_and_slash", 100, 800, 344594], "11": ["vertical_and_slash", 100, 800, 4476113], "12": ["vertical_and_slash", 1000, 6096, 13461002], "13": ["vertical_and_slash", 1000, 6096, 10764088], "14": ["vertical_and_slash", 1000, 6096, 12256526], "15": ["vertical_and_slash", 1000, 6096, 13680456], "16": ["vertical_and_slash", 30, 800, 247807], "17": ["vertical_and_slash", 30, 800, 283870], "18": ["vertical_and_slash", 30, 800, 8225577], "19": ["vertical_and_slash", 30, 800, 448632], "20": ["vertical_and_slash", 1000, 6096, 4175564], "21": ["vertical_and_slash", 1000, 6096, 2726117], "22": ["vertical_and_slash", 1000, 6096, 310838], "23": ["vertical_and_slash", 1000, 
6096, 204919], "24": ["vertical_and_slash", 30, 800, 875524], "25": ["vertical_and_slash", 30, 800, 1182277], "26": ["vertical_and_slash", 30, 800, 4252580], "27": ["vertical_and_slash", 100, 800, 728402], "28": ["vertical_and_slash", 1000, 6096, 12755775], "29": ["vertical_and_slash", 1000, 6096, 13455097], "30": ["vertical_and_slash", 100, 800, 10492805], "31": ["vertical_and_slash", 3500, 100, 11957996]}, {"0": ["vertical_and_slash", 500, 700, 386640], "1": ["vertical_and_slash", 100, 800, 819517], "2": ["vertical_and_slash", 30, 800, 1170984], "3": ["vertical_and_slash", 100, 800, 626489], "4": ["vertical_and_slash", 1000, 6096, 5856605], "5": ["vertical_and_slash", 1000, 6096, 12960788], "6": ["vertical_and_slash", 1000, 6096, 13042017], "7": ["vertical_and_slash", 1000, 6096, 12542120], "8": ["vertical_and_slash", 1000, 6096, 24167], "9": ["vertical_and_slash", 100, 800, 440430], "10": ["vertical_and_slash", 3500, 100, 748759], "11": ["vertical_and_slash", 1000, 6096, 4655], "12": ["vertical_and_slash", 1000, 6096, 10739360], "13": ["vertical_and_slash", 1000, 6096, 9336615], "14": ["vertical_and_slash", 3500, 100, 14305575], "15": ["vertical_and_slash", 3500, 100, 13833292], "16": ["vertical_and_slash", 30, 800, 3412], "17": ["vertical_and_slash", 500, 700, 16614], "18": ["vertical_and_slash", 1000, 6096, 839930], "19": ["vertical_and_slash", 500, 700, 77296], "20": ["vertical_and_slash", 1000, 6096, 11148082], "21": ["vertical_and_slash", 100, 800, 2483383], "22": ["vertical_and_slash", 3500, 100, 11902907], "23": ["vertical_and_slash", 100, 800, 2194], "24": ["vertical_and_slash", 1000, 6096, 4441496], "25": ["vertical_and_slash", 3500, 100, 10827107], "26": ["vertical_and_slash", 100, 800, 105753], "27": ["vertical_and_slash", 1000, 6096, 5261357], "28": ["vertical_and_slash", 30, 800, 61603], "29": ["vertical_and_slash", 30, 800, 108480], "30": ["vertical_and_slash", 30, 800, 30219], "31": ["vertical_and_slash", 30, 800, 31426]}, {"0": ["vertical_and_slash", 1000, 6096, 136760], "1": ["vertical_and_slash", 100, 800, 827733], "2": ["vertical_and_slash", 100, 800, 670059], "3": ["vertical_and_slash", 3500, 100, 502020], "4": ["vertical_and_slash", 100, 800, 469444], "5": ["vertical_and_slash", 100, 800, 162670], "6": ["vertical_and_slash", 1000, 6096, 22310], "7": ["vertical_and_slash", 1000, 6096, 465], "8": ["vertical_and_slash", 30, 800, 951054], "9": ["vertical_and_slash", 30, 800, 799102], "10": ["vertical_and_slash", 30, 800, 936020], "11": ["vertical_and_slash", 30, 800, 2027181], "12": ["vertical_and_slash", 3500, 100, 5986265], "13": ["vertical_and_slash", 500, 700, 3941412], "14": ["vertical_and_slash", 100, 800, 10557303], "15": ["vertical_and_slash", 100, 800, 1533916], "16": ["vertical_and_slash", 3500, 100, 11870953], "17": ["vertical_and_slash", 3500, 100, 12342581], "18": ["vertical_and_slash", 3500, 100, 12699180], "19": ["vertical_and_slash", 1000, 6096, 5138869], "20": ["vertical_and_slash", 1000, 6096, 12477033], "21": ["vertical_and_slash", 1000, 6096, 872144], "22": ["vertical_and_slash", 3500, 100, 13382501], "23": ["vertical_and_slash", 1000, 6096, 11531397], "24": ["vertical_and_slash", 1000, 6096, 13884364], "25": ["vertical_and_slash", 1000, 6096, 13611635], "26": ["vertical_and_slash", 1000, 6096, 13516676], "27": ["vertical_and_slash", 1000, 6096, 12560863], "28": ["vertical_and_slash", 500, 700, 3865996], "29": ["vertical_and_slash", 30, 800, 3343532], "30": ["vertical_and_slash", 30, 800, 179777], "31": ["vertical_and_slash", 3500, 100, 3863085]}, 
{"0": ["vertical_and_slash", 3500, 100, 6771823], "1": ["vertical_and_slash", 3500, 100, 10770780], "2": ["vertical_and_slash", 1000, 6096, 108476], "3": ["vertical_and_slash", 1000, 6096, 917033], "4": ["vertical_and_slash", 3500, 100, 9994951], "5": ["vertical_and_slash", 3500, 100, 13503132], "6": ["vertical_and_slash", 3500, 100, 11843766], "7": ["vertical_and_slash", 3500, 100, 10714999], "8": ["vertical_and_slash", 100, 800, 650037], "9": ["vertical_and_slash", 30, 800, 321924], "10": ["vertical_and_slash", 100, 800, 306681], "11": ["vertical_and_slash", 100, 800, 76181], "12": ["vertical_and_slash", 3500, 100, 12194592], "13": ["vertical_and_slash", 1000, 6096, 12635491], "14": ["vertical_and_slash", 3500, 100, 11953805], "15": ["vertical_and_slash", 3500, 100, 12355730], "16": ["vertical_and_slash", 100, 800, 614284], "17": ["vertical_and_slash", 100, 800, 512751], "18": ["vertical_and_slash", 3500, 100, 2679940], "19": ["vertical_and_slash", 100, 800, 1749683], "20": ["vertical_and_slash", 30, 800, 563622], "21": ["vertical_and_slash", 30, 800, 9985639], "22": ["vertical_and_slash", 30, 800, 1055029], "23": ["vertical_and_slash", 30, 800, 501782], "24": ["vertical_and_slash", 30, 800, 68229], "25": ["vertical_and_slash", 100, 800, 211743], "26": ["vertical_and_slash", 100, 800, 1690702], "27": ["vertical_and_slash", 30, 800, 2720080], "28": ["vertical_and_slash", 30, 800, 3884686], "29": ["vertical_and_slash", 30, 800, 3303748], "30": ["vertical_and_slash", 30, 800, 3335960], "31": ["vertical_and_slash", 30, 800, 2469116]}, {"0": ["vertical_and_slash", 1000, 6096, 726797], "1": ["vertical_and_slash", 100, 800, 5833160], "2": ["vertical_and_slash", 1000, 6096, 1766748], "3": ["vertical_and_slash", 1000, 6096, 6021028], "4": ["vertical_and_slash", 1000, 6096, 3120126], "5": ["vertical_and_slash", 30, 800, 3103142], "6": ["vertical_and_slash", 1000, 6096, 22974], "7": ["vertical_and_slash", 1000, 6096, 616209], "8": ["vertical_and_slash", 100, 800, 5571258], "9": ["vertical_and_slash", 30, 800, 2259315], "10": ["vertical_and_slash", 1000, 6096, 438342], "11": ["vertical_and_slash", 100, 800, 5557528], "12": ["vertical_and_slash", 3500, 100, 12954645], "13": ["vertical_and_slash", 1000, 6096, 12677660], "14": ["vertical_and_slash", 3500, 100, 13038925], "15": ["vertical_and_slash", 1000, 6096, 11239328], "16": ["vertical_and_slash", 3500, 100, 5247646], "17": ["vertical_and_slash", 500, 700, 384866], "18": ["vertical_and_slash", 1000, 6096, 655131], "19": ["vertical_and_slash", 3500, 100, 8826025], "20": ["vertical_and_slash", 100, 800, 4478606], "21": ["vertical_and_slash", 100, 800, 3881052], "22": ["vertical_and_slash", 100, 800, 6027887], "23": ["vertical_and_slash", 3500, 100, 8475077], "24": ["vertical_and_slash", 1000, 6096, 103633], "25": ["vertical_and_slash", 1000, 6096, 76484], "26": ["vertical_and_slash", 100, 800, 22432], "27": ["vertical_and_slash", 1000, 6096, 1313063], "28": ["vertical_and_slash", 1000, 6096, 6617078], "29": ["vertical_and_slash", 3500, 100, 12355842], "30": ["vertical_and_slash", 100, 800, 1401085], "31": ["vertical_and_slash", 3500, 100, 11350169]}, {"0": ["vertical_and_slash", 100, 800, 142456], "1": ["vertical_and_slash", 500, 700, 290481], "2": ["vertical_and_slash", 30, 800, 195338], "3": ["vertical_and_slash", 30, 800, 235375], "4": ["vertical_and_slash", 3500, 100, 13220328], "5": ["vertical_and_slash", 1000, 6096, 13040738], "6": ["vertical_and_slash", 3500, 100, 14847993], "7": ["vertical_and_slash", 1000, 6096, 12236451], "8": 
["vertical_and_slash", 30, 800, 1360565], "9": ["vertical_and_slash", 30, 800, 115757], "10": ["vertical_and_slash", 30, 800, 806615], "11": ["vertical_and_slash", 30, 800, 5655605], "12": ["vertical_and_slash", 1000, 6096, 803465], "13": ["vertical_and_slash", 1000, 6096, 7601845], "14": ["vertical_and_slash", 30, 800, 8869563], "15": ["vertical_and_slash", 100, 800, 9177143], "16": ["vertical_and_slash", 1000, 6096, 612999], "17": ["vertical_and_slash", 100, 800, 2657352], "18": ["vertical_and_slash", 1000, 6096, 297015], "19": ["vertical_and_slash", 1000, 6096, 309571], "20": ["vertical_and_slash", 1000, 6096, 13160644], "21": ["vertical_and_slash", 1000, 6096, 14006964], "22": ["vertical_and_slash", 3500, 100, 14287913], "23": ["vertical_and_slash", 3500, 100, 14586379], "24": ["vertical_and_slash", 1000, 6096, 12023244], "25": ["vertical_and_slash", 30, 800, 12092108], "26": ["vertical_and_slash", 500, 700, 6005169], "27": ["vertical_and_slash", 500, 700, 9574963], "28": ["vertical_and_slash", 1000, 6096, 1696021], "29": ["vertical_and_slash", 30, 800, 1516298], "30": ["vertical_and_slash", 1000, 6096, 2303483], "31": ["vertical_and_slash", 1000, 6096, 903636]}, {"0": ["vertical_and_slash", 3500, 100, 7496361], "1": ["vertical_and_slash", 30, 800, 571560], "2": ["vertical_and_slash", 100, 800, 3025676], "3": ["vertical_and_slash", 30, 800, 5167076], "4": ["vertical_and_slash", 30, 800, 501453], "5": ["vertical_and_slash", 30, 800, 342659], "6": ["vertical_and_slash", 30, 800, 2561588], "7": ["vertical_and_slash", 30, 800, 869660], "8": ["vertical_and_slash", 100, 800, 10740412], "9": ["vertical_and_slash", 30, 800, 87115], "10": ["vertical_and_slash", 3500, 100, 9800623], "11": ["vertical_and_slash", 30, 800, 9191448], "12": ["vertical_and_slash", 1000, 6096, 289817], "13": ["vertical_and_slash", 3500, 100, 9009480], "14": ["vertical_and_slash", 1000, 6096, 1799625], "15": ["vertical_and_slash", 1000, 6096, 4984031], "16": ["vertical_and_slash", 3500, 100, 3381538], "17": ["vertical_and_slash", 100, 800, 11456778], "18": ["vertical_and_slash", 3500, 100, 14316760], "19": ["vertical_and_slash", 100, 800, 5228661], "20": ["vertical_and_slash", 3500, 100, 5831971], "21": ["vertical_and_slash", 500, 700, 10184028], "22": ["vertical_and_slash", 30, 800, 578221], "23": ["vertical_and_slash", 3500, 100, 6213253], "24": ["vertical_and_slash", 1000, 6096, 6146366], "25": ["vertical_and_slash", 1000, 6096, 1477166], "26": ["vertical_and_slash", 30, 800, 318810], "27": ["vertical_and_slash", 1000, 6096, 8654738], "28": ["vertical_and_slash", 500, 700, 3294065], "29": ["vertical_and_slash", 100, 800, 8531992], "30": ["vertical_and_slash", 100, 800, 2564233], "31": ["vertical_and_slash", 100, 800, 113957]}, {"0": ["vertical_and_slash", 100, 800, 530019], "1": ["vertical_and_slash", 100, 800, 647580], "2": ["vertical_and_slash", 30, 800, 4990437], "3": ["vertical_and_slash", 30, 800, 317415], "4": ["vertical_and_slash", 100, 800, 365956], "5": ["vertical_and_slash", 100, 800, 1689094], "6": ["vertical_and_slash", 100, 800, 454281], "7": ["vertical_and_slash", 30, 800, 266331], "8": ["vertical_and_slash", 3500, 100, 3603593], "9": ["vertical_and_slash", 100, 800, 14614370], "10": ["vertical_and_slash", 1000, 6096, 5361097], "11": ["vertical_and_slash", 100, 800, 14371859], "12": ["vertical_and_slash", 30, 800, 1232558], "13": ["vertical_and_slash", 30, 800, 546028], "14": ["vertical_and_slash", 30, 800, 853313], "15": ["vertical_and_slash", 30, 800, 194933], "16": ["vertical_and_slash", 3500, 100, 
14304381], "17": ["vertical_and_slash", 1000, 6096, 815541], "18": ["vertical_and_slash", 100, 800, 5138518], "19": ["vertical_and_slash", 3500, 100, 9565094], "20": ["vertical_and_slash", 1000, 6096, 2035169], "21": ["vertical_and_slash", 1000, 6096, 3375423], "22": ["vertical_and_slash", 1000, 6096, 3777615], "23": ["vertical_and_slash", 1000, 6096, 12354929], "24": ["vertical_and_slash", 30, 800, 1763576], "25": ["vertical_and_slash", 30, 800, 3727796], "26": ["vertical_and_slash", 30, 800, 2744406], "27": ["vertical_and_slash", 30, 800, 1997757], "28": ["vertical_and_slash", 1000, 6096, 12257], "29": ["vertical_and_slash", 1000, 6096, 1169443], "30": ["vertical_and_slash", 3500, 100, 5723144], "31": ["vertical_and_slash", 3500, 100, 5420298]}, {"0": ["vertical_and_slash", 1000, 6096, 2447512], "1": ["vertical_and_slash", 3500, 100, 10860908], "2": ["vertical_and_slash", 100, 800, 9108572], "3": ["vertical_and_slash", 3500, 100, 11624453], "4": ["vertical_and_slash", 100, 800, 6925192], "5": ["vertical_and_slash", 100, 800, 9369879], "6": ["vertical_and_slash", 3500, 100, 11865786], "7": ["vertical_and_slash", 30, 800, 9628595], "8": ["vertical_and_slash", 1000, 6096, 6302171], "9": ["vertical_and_slash", 3500, 100, 8455497], "10": ["vertical_and_slash", 30, 800, 6885122], "11": ["vertical_and_slash", 1000, 6096, 5076785], "12": ["vertical_and_slash", 1000, 6096, 12769698], "13": ["vertical_and_slash", 1000, 6096, 13513363], "14": ["vertical_and_slash", 1000, 6096, 14089388], "15": ["vertical_and_slash", 1000, 6096, 14501815], "16": ["vertical_and_slash", 1000, 6096, 1619566], "17": ["vertical_and_slash", 1000, 6096, 5031895], "18": ["vertical_and_slash", 1000, 6096, 3833561], "19": ["vertical_and_slash", 100, 800, 12325460], "20": ["vertical_and_slash", 1000, 6096, 320906], "21": ["vertical_and_slash", 3500, 100, 13924855], "22": ["vertical_and_slash", 100, 800, 10478874], "23": ["vertical_and_slash", 30, 800, 4410655], "24": ["vertical_and_slash", 3500, 100, 14767197], "25": ["vertical_and_slash", 1000, 6096, 4108672], "26": ["vertical_and_slash", 100, 800, 14797906], "27": ["vertical_and_slash", 3500, 100, 14643144], "28": ["vertical_and_slash", 100, 800, 10556268], "29": ["vertical_and_slash", 3500, 100, 14575250], "30": ["vertical_and_slash", 1000, 6096, 14076831], "31": ["vertical_and_slash", 1000, 6096, 10779010]}, {"0": ["vertical_and_slash", 30, 800, 4744885], "1": ["vertical_and_slash", 30, 800, 4794511], "2": ["vertical_and_slash", 30, 800, 9418373], "3": ["vertical_and_slash", 30, 800, 2291979], "4": ["vertical_and_slash", 30, 800, 10009392], "5": ["vertical_and_slash", 30, 800, 981769], "6": ["vertical_and_slash", 30, 800, 3395467], "7": ["vertical_and_slash", 100, 800, 5966942], "8": ["vertical_and_slash", 30, 800, 7092993], "9": ["vertical_and_slash", 30, 800, 2176489], "10": ["vertical_and_slash", 30, 800, 4330010], "11": ["vertical_and_slash", 1000, 6096, 2664159], "12": ["vertical_and_slash", 30, 800, 7282328], "13": ["vertical_and_slash", 30, 800, 14135136], "14": ["vertical_and_slash", 1000, 6096, 791118], "15": ["vertical_and_slash", 30, 800, 9266081], "16": ["vertical_and_slash", 3500, 100, 14422288], "17": ["vertical_and_slash", 3500, 100, 11457529], "18": ["vertical_and_slash", 30, 800, 4503306], "19": ["vertical_and_slash", 100, 800, 11937543], "20": ["vertical_and_slash", 3500, 100, 14538141], "21": ["vertical_and_slash", 3500, 100, 13564714], "22": ["vertical_and_slash", 100, 800, 9671640], "23": ["vertical_and_slash", 30, 800, 2841456], "24": 
["vertical_and_slash", 30, 800, 1395156], "25": ["vertical_and_slash", 30, 800, 989026], "26": ["vertical_and_slash", 30, 800, 10617339], "27": ["vertical_and_slash", 30, 800, 8170836], "28": ["vertical_and_slash", 100, 800, 2032096], "29": ["vertical_and_slash", 3500, 100, 13931334], "30": ["vertical_and_slash", 3500, 100, 14790424], "31": ["vertical_and_slash", 1000, 6096, 4133248]}]
minference/configs/Phi_3_mini_128k_instruct_kv_out_v32_fit_o_best_pattern.json ADDED
@@ -0,0 +1 @@
1
+ [{"0": ["vertical_and_slash", 1000, 6096, 0.33349305391311646], "1": ["vertical_and_slash", 1000, 6096, 0.4378805160522461], "2": ["vertical_and_slash", 1000, 6096, 0.48282963037490845], "3": ["vertical_and_slash", 1000, 6096, 0.37695789337158203], "4": ["vertical_and_slash", 1000, 6096, 0.38924556970596313], "5": ["vertical_and_slash", 1000, 6096, 0.3510749340057373], "6": ["vertical_and_slash", 1000, 6096, 0.39886632561683655], "7": ["vertical_and_slash", 1000, 6096, 0.8939290046691895], "8": ["vertical_and_slash", 1000, 6096, 0.44007450342178345], "9": ["vertical_and_slash", 1000, 6096, 0.3897586464881897], "10": ["vertical_and_slash", 1000, 6096, 0.40355661511421204], "11": ["vertical_and_slash", 1000, 6096, 0.36381030082702637], "12": ["vertical_and_slash", 1000, 6096, 0.4459313154220581], "13": ["vertical_and_slash", 1000, 6096, 0.3341565728187561], "14": ["vertical_and_slash", 1000, 6096, 0.384276419878006], "15": ["vertical_and_slash", 1000, 6096, 0.34818336367607117], "16": ["vertical_and_slash", 1000, 6096, 0.3867861330509186], "17": ["vertical_and_slash", 1000, 6096, 0.3639705777168274], "18": ["vertical_and_slash", 1000, 6096, 0.3512721359729767], "19": ["vertical_and_slash", 1000, 6096, 0.4681489169597626], "20": ["vertical_and_slash", 1000, 6096, 0.4651115834712982], "21": ["vertical_and_slash", 1000, 6096, 0.3882596790790558], "22": ["vertical_and_slash", 1000, 6096, 0.47017091512680054], "23": ["vertical_and_slash", 1000, 6096, 0.8037586808204651], "24": ["vertical_and_slash", 1000, 6096, 0.3913174867630005], "25": ["vertical_and_slash", 1000, 6096, 0.5203016400337219], "26": ["vertical_and_slash", 1000, 6096, 0.47166702151298523], "27": ["vertical_and_slash", 1000, 6096, 0.760438084602356], "28": ["vertical_and_slash", 1000, 6096, 0.943070650100708], "29": ["vertical_and_slash", 1000, 6096, 0.4118039011955261], "30": ["vertical_and_slash", 1000, 6096, 0.6815055012702942], "31": ["vertical_and_slash", 1000, 6096, 0.6300445795059204]}, {"0": ["vertical_and_slash", 1000, 6096, 0.6439709663391113], "1": ["vertical_and_slash", 1000, 6096, 0.5207313895225525], "2": ["vertical_and_slash", 1000, 6096, 0.47401225566864014], "3": ["vertical_and_slash", 1000, 6096, 0.5988013744354248], "4": ["vertical_and_slash", 1000, 6096, 0.6021823287010193], "5": ["vertical_and_slash", 1000, 6096, 0.4162128269672394], "6": ["vertical_and_slash", 1000, 6096, 0.7858797311782837], "7": ["vertical_and_slash", 1000, 6096, 0.6350969672203064], "8": ["vertical_and_slash", 1000, 6096, 0.5817031860351562], "9": ["vertical_and_slash", 1000, 6096, 0.9291586875915527], "10": ["vertical_and_slash", 1000, 6096, 0.6078806519508362], "11": ["vertical_and_slash", 1000, 6096, 0.5813876986503601], "12": ["vertical_and_slash", 1000, 6096, 0.7652914524078369], "13": ["vertical_and_slash", 1000, 6096, 0.4502100944519043], "14": ["vertical_and_slash", 1000, 6096, 0.6180105209350586], "15": ["vertical_and_slash", 1000, 6096, 0.7175759673118591], "16": ["vertical_and_slash", 1000, 6096, 0.6323421597480774], "17": ["vertical_and_slash", 3500, 100, 0.479082852602005], "18": ["vertical_and_slash", 1000, 6096, 0.6011233329772949], "19": ["vertical_and_slash", 1000, 6096, 0.8908118605613708], "20": ["vertical_and_slash", 1000, 6096, 0.9255861639976501], "21": ["vertical_and_slash", 1000, 6096, 0.795491099357605], "22": ["vertical_and_slash", 1000, 6096, 0.5210989117622375], "23": ["vertical_and_slash", 1000, 6096, 0.5200297236442566], "24": ["vertical_and_slash", 1000, 6096, 0.5280771255493164], "25": ["vertical_and_slash", 
1000, 6096, 0.7380014657974243], "26": ["vertical_and_slash", 1000, 6096, 0.9885807633399963], "27": ["vertical_and_slash", 30, 800, 0.8718840479850769], "28": ["vertical_and_slash", 1000, 6096, 0.6302862167358398], "29": ["vertical_and_slash", 1000, 6096, 0.5750876069068909], "30": ["vertical_and_slash", 1000, 6096, 0.45260417461395264], "31": ["vertical_and_slash", 1000, 6096, 0.6499432325363159]}, {"0": ["vertical_and_slash", 1000, 6096, 0.7977765798568726], "1": ["vertical_and_slash", 1000, 6096, 0.8083621859550476], "2": ["vertical_and_slash", 1000, 6096, 0.5935484170913696], "3": ["vertical_and_slash", 1000, 6096, 0.5435713529586792], "4": ["vertical_and_slash", 1000, 6096, 0.5687218904495239], "5": ["vertical_and_slash", 1000, 6096, 0.854501485824585], "6": ["vertical_and_slash", 1000, 6096, 0.6359673142433167], "7": ["vertical_and_slash", 1000, 6096, 0.5785433053970337], "8": ["vertical_and_slash", 1000, 6096, 0.8543683290481567], "9": ["vertical_and_slash", 1000, 6096, 0.762371838092804], "10": ["vertical_and_slash", 1000, 6096, 0.6970657706260681], "11": ["vertical_and_slash", 1000, 6096, 0.6844046115875244], "12": ["vertical_and_slash", 1000, 6096, 0.7364732623100281], "13": ["vertical_and_slash", 1000, 6096, 0.8335257172584534], "14": ["vertical_and_slash", 1000, 6096, 0.7734203934669495], "15": ["vertical_and_slash", 1000, 6096, 0.7341973185539246], "16": ["vertical_and_slash", 1000, 6096, 0.7554108500480652], "17": ["vertical_and_slash", 1000, 6096, 0.9054623246192932], "18": ["vertical_and_slash", 1000, 6096, 0.6300320029258728], "19": ["vertical_and_slash", 1000, 6096, 0.70512455701828], "20": ["vertical_and_slash", 1000, 6096, 0.6085258722305298], "21": ["vertical_and_slash", 1000, 6096, 0.6398192644119263], "22": ["vertical_and_slash", 1000, 6096, 0.5992570519447327], "23": ["vertical_and_slash", 1000, 6096, 0.7130728363990784], "24": ["vertical_and_slash", 1000, 6096, 0.8504863977432251], "25": ["vertical_and_slash", 1000, 6096, 0.5748745799064636], "26": ["vertical_and_slash", 1000, 6096, 0.7758736610412598], "27": ["vertical_and_slash", 1000, 6096, 0.5538337230682373], "28": ["vertical_and_slash", 1000, 6096, 0.7384650707244873], "29": ["vertical_and_slash", 1000, 6096, 0.6905707120895386], "30": ["vertical_and_slash", 1000, 6096, 0.6217074990272522], "31": ["vertical_and_slash", 1000, 6096, 0.9545422196388245]}, {"0": ["vertical_and_slash", 500, 700, 0.9924208521842957], "1": ["vertical_and_slash", 100, 750, 0.9987075924873352], "2": ["vertical_and_slash", 500, 700, 0.9915499687194824], "3": ["vertical_and_slash", 100, 750, 0.9940086007118225], "4": ["vertical_and_slash", 100, 750, 0.9947375655174255], "5": ["vertical_and_slash", 100, 750, 0.9920898675918579], "6": ["vertical_and_slash", 100, 750, 0.9960256218910217], "7": ["vertical_and_slash", 100, 750, 0.995691180229187], "8": ["vertical_and_slash", 100, 750, 0.9113738536834717], "9": ["vertical_and_slash", 100, 750, 0.9700976014137268], "10": ["vertical_and_slash", 3500, 100, 0.9520721435546875], "11": ["vertical_and_slash", 100, 750, 0.9561598300933838], "12": ["vertical_and_slash", 100, 750, 0.8256366848945618], "13": ["vertical_and_slash", 100, 750, 0.9905430674552917], "14": ["vertical_and_slash", 500, 700, 0.9822967648506165], "15": ["vertical_and_slash", 100, 750, 0.9880149960517883], "16": ["vertical_and_slash", 100, 750, 0.9570814967155457], "17": ["vertical_and_slash", 100, 750, 0.9678364396095276], "18": ["vertical_and_slash", 3500, 100, 0.9819864630699158], "19": ["vertical_and_slash", 100, 750, 
0.9930639266967773], "20": ["vertical_and_slash", 3500, 100, 0.9928342700004578], "21": ["vertical_and_slash", 3500, 100, 0.9522428512573242], "22": ["vertical_and_slash", 100, 750, 0.9961853623390198], "23": ["vertical_and_slash", 100, 750, 0.9895046353340149], "24": ["vertical_and_slash", 100, 750, 0.9106875061988831], "25": ["vertical_and_slash", 100, 750, 0.9944272041320801], "26": ["vertical_and_slash", 100, 750, 0.9603897333145142], "27": ["vertical_and_slash", 100, 750, 0.9967218637466431], "28": ["vertical_and_slash", 100, 750, 0.9922856092453003], "29": ["vertical_and_slash", 100, 750, 0.9425711631774902], "30": ["vertical_and_slash", 1000, 6096, 0.6492345333099365], "31": ["vertical_and_slash", 500, 700, 0.957703709602356]}, {"0": ["vertical_and_slash", 100, 750, 0.9920511841773987], "1": ["vertical_and_slash", 3500, 100, 0.9784621000289917], "2": ["vertical_and_slash", 100, 750, 0.9945407509803772], "3": ["vertical_and_slash", 100, 750, 0.9613493084907532], "4": ["vertical_and_slash", 100, 750, 0.8482271432876587], "5": ["vertical_and_slash", 500, 700, 0.9943300485610962], "6": ["vertical_and_slash", 100, 750, 0.9810841083526611], "7": ["vertical_and_slash", 3500, 100, 0.9297769069671631], "8": ["vertical_and_slash", 100, 750, 0.8839191198348999], "9": ["vertical_and_slash", 100, 750, 0.9955653548240662], "10": ["vertical_and_slash", 100, 750, 0.9484658241271973], "11": ["vertical_and_slash", 100, 750, 0.994473397731781], "12": ["vertical_and_slash", 500, 700, 0.9420907497406006], "13": ["vertical_and_slash", 100, 750, 0.9161052107810974], "14": ["vertical_and_slash", 100, 750, 0.9645522832870483], "15": ["vertical_and_slash", 100, 750, 0.9875764846801758], "16": ["vertical_and_slash", 100, 750, 0.7891636490821838], "17": ["vertical_and_slash", 1000, 6096, 0.7788199186325073], "18": ["vertical_and_slash", 100, 750, 0.9488416910171509], "19": ["vertical_and_slash", 3500, 100, 0.9959850311279297], "20": ["vertical_and_slash", 100, 750, 0.9768155217170715], "21": ["vertical_and_slash", 100, 750, 0.995807945728302], "22": ["vertical_and_slash", 3500, 100, 0.8900895118713379], "23": ["vertical_and_slash", 100, 750, 0.9586788415908813], "24": ["vertical_and_slash", 100, 750, 0.9651024341583252], "25": ["vertical_and_slash", 3500, 100, 0.9384130239486694], "26": ["vertical_and_slash", 100, 750, 0.9855350255966187], "27": ["vertical_and_slash", 100, 750, 0.9657205939292908], "28": ["vertical_and_slash", 3500, 100, 0.9184022545814514], "29": ["vertical_and_slash", 100, 750, 0.866909384727478], "30": ["vertical_and_slash", 1000, 6096, 0.7826077342033386], "31": ["vertical_and_slash", 100, 750, 0.9975974559783936]}, {"0": ["vertical_and_slash", 100, 750, 0.9865456223487854], "1": ["vertical_and_slash", 100, 750, 0.9591361880302429], "2": ["vertical_and_slash", 100, 750, 0.9168012142181396], "3": ["vertical_and_slash", 500, 700, 0.9530511498451233], "4": ["vertical_and_slash", 1000, 6096, 0.8645423650741577], "5": ["vertical_and_slash", 500, 700, 0.9792267084121704], "6": ["vertical_and_slash", 100, 750, 0.9941954612731934], "7": ["vertical_and_slash", 100, 750, 0.960307776927948], "8": ["vertical_and_slash", 3500, 100, 0.9855586886405945], "9": ["vertical_and_slash", 100, 750, 0.9828901886940002], "10": ["vertical_and_slash", 100, 750, 0.8591288328170776], "11": ["vertical_and_slash", 100, 750, 0.917044460773468], "12": ["vertical_and_slash", 100, 750, 0.9849950075149536], "13": ["vertical_and_slash", 100, 750, 0.8859434723854065], "14": ["vertical_and_slash", 100, 750, 
0.9971017241477966], "15": ["vertical_and_slash", 500, 700, 0.9620269536972046], "16": ["vertical_and_slash", 500, 700, 0.9597799181938171], "17": ["vertical_and_slash", 500, 700, 0.9934410452842712], "18": ["vertical_and_slash", 3500, 100, 0.9977172017097473], "19": ["vertical_and_slash", 500, 700, 0.9520473480224609], "20": ["vertical_and_slash", 3500, 100, 0.9906032085418701], "21": ["vertical_and_slash", 100, 750, 0.9745447635650635], "22": ["vertical_and_slash", 100, 750, 0.9957244396209717], "23": ["vertical_and_slash", 100, 750, 0.9829675555229187], "24": ["vertical_and_slash", 100, 750, 0.9565562009811401], "25": ["vertical_and_slash", 100, 750, 0.9823064804077148], "26": ["vertical_and_slash", 100, 750, 0.987698495388031], "27": ["vertical_and_slash", 1000, 6096, 0.8219541907310486], "28": ["vertical_and_slash", 1000, 6096, 0.7586351633071899], "29": ["vertical_and_slash", 100, 750, 0.9752539992332458], "30": ["vertical_and_slash", 100, 750, 0.9929803609848022], "31": ["vertical_and_slash", 100, 750, 0.9185792803764343]}, {"0": ["vertical_and_slash", 100, 750, 0.9146243333816528], "1": ["vertical_and_slash", 100, 750, 0.9178520441055298], "2": ["vertical_and_slash", 3500, 100, 0.9930599331855774], "3": ["vertical_and_slash", 100, 750, 0.9993709325790405], "4": ["vertical_and_slash", 500, 700, 0.9853806495666504], "5": ["vertical_and_slash", 100, 750, 0.9141497015953064], "6": ["vertical_and_slash", 100, 750, 0.992788553237915], "7": ["vertical_and_slash", 100, 750, 0.9772038459777832], "8": ["vertical_and_slash", 1000, 6096, 0.6869983673095703], "9": ["vertical_and_slash", 100, 750, 0.9871460795402527], "10": ["vertical_and_slash", 100, 750, 0.9741801619529724], "11": ["vertical_and_slash", 100, 750, 0.9956739544868469], "12": ["vertical_and_slash", 100, 750, 0.9555794596672058], "13": ["vertical_and_slash", 3500, 100, 0.8615856766700745], "14": ["vertical_and_slash", 3500, 100, 0.9012727737426758], "15": ["vertical_and_slash", 100, 750, 0.9786412715911865], "16": ["vertical_and_slash", 3500, 100, 0.7491975426673889], "17": ["vertical_and_slash", 100, 750, 0.9849361181259155], "18": ["vertical_and_slash", 3500, 100, 0.9097980856895447], "19": ["vertical_and_slash", 1000, 6096, 0.8621278405189514], "20": ["vertical_and_slash", 500, 700, 0.9943590760231018], "21": ["vertical_and_slash", 100, 750, 0.8645753264427185], "22": ["vertical_and_slash", 100, 750, 0.9920986294746399], "23": ["vertical_and_slash", 1000, 6096, 0.8657084703445435], "24": ["vertical_and_slash", 3500, 100, 0.9750965237617493], "25": ["vertical_and_slash", 3500, 100, 0.8507974147796631], "26": ["vertical_and_slash", 3500, 100, 0.9118348360061646], "27": ["vertical_and_slash", 3500, 100, 0.9703859090805054], "28": ["vertical_and_slash", 3500, 100, 0.9725451469421387], "29": ["vertical_and_slash", 1000, 6096, 0.7008982300758362], "30": ["vertical_and_slash", 1000, 6096, 0.838621199131012], "31": ["vertical_and_slash", 100, 750, 0.9929103255271912]}, {"0": ["vertical_and_slash", 1000, 6096, 0.7402030825614929], "1": ["vertical_and_slash", 1000, 6096, 0.8565414547920227], "2": ["vertical_and_slash", 100, 750, 0.9612839221954346], "3": ["vertical_and_slash", 1000, 6096, 0.9598837494850159], "4": ["vertical_and_slash", 1000, 6096, 0.7645464539527893], "5": ["vertical_and_slash", 100, 750, 0.9872377514839172], "6": ["vertical_and_slash", 1000, 6096, 0.7918620705604553], "7": ["vertical_and_slash", 500, 700, 0.9622856378555298], "8": ["vertical_and_slash", 100, 750, 0.8891160488128662], "9": ["vertical_and_slash", 500, 
700, 0.9844319224357605], "10": ["vertical_and_slash", 500, 700, 0.9876360297203064], "11": ["vertical_and_slash", 500, 700, 0.9688720703125], "12": ["vertical_and_slash", 1000, 6096, 0.5671995878219604], "13": ["vertical_and_slash", 100, 750, 0.9620596170425415], "14": ["vertical_and_slash", 1000, 6096, 0.6478529572486877], "15": ["vertical_and_slash", 100, 750, 0.9807542562484741], "16": ["vertical_and_slash", 3500, 100, 0.9823787212371826], "17": ["vertical_and_slash", 100, 750, 0.8980384469032288], "18": ["vertical_and_slash", 1000, 6096, 0.8713955879211426], "19": ["vertical_and_slash", 100, 750, 0.9611169099807739], "20": ["vertical_and_slash", 100, 750, 0.9941024780273438], "21": ["vertical_and_slash", 100, 750, 0.9876882433891296], "22": ["vertical_and_slash", 3500, 100, 0.9474965333938599], "23": ["vertical_and_slash", 100, 750, 0.9415712952613831], "24": ["vertical_and_slash", 100, 750, 0.9960836172103882], "25": ["vertical_and_slash", 100, 750, 0.9898598194122314], "26": ["vertical_and_slash", 100, 750, 0.9720168113708496], "27": ["vertical_and_slash", 100, 750, 0.985356330871582], "28": ["vertical_and_slash", 3500, 100, 0.9795358180999756], "29": ["vertical_and_slash", 100, 750, 0.970496654510498], "30": ["vertical_and_slash", 3500, 100, 0.999195396900177], "31": ["vertical_and_slash", 100, 750, 0.9589951038360596]}, {"0": ["vertical_and_slash", 1000, 6096, 0.8079184889793396], "1": ["stream_llm", 100, 800, 0.96484375], "2": ["vertical_and_slash", 1000, 6096, 0.6607644557952881], "3": ["vertical_and_slash", 30, 800, 0.9899947047233582], "4": ["vertical_and_slash", 1000, 6096, 0.9565256237983704], "5": ["vertical_and_slash", 1000, 6096, 0.9755614995956421], "6": ["vertical_and_slash", 30, 800, 0.9720635414123535], "7": ["vertical_and_slash", 30, 800, 0.9191414713859558], "8": ["stream_llm", 100, 800, 0.9921875], "9": ["vertical_and_slash", 1000, 6096, 0.6984944939613342], "10": ["stream_llm", 100, 800, 0.97265625], "11": ["vertical_and_slash", 30, 800, 0.955635666847229], "12": ["vertical_and_slash", 1000, 6096, 0.9949175715446472], "13": ["vertical_and_slash", 30, 800, 0.9833577871322632], "14": ["vertical_and_slash", 1000, 6096, 0.612384021282196], "15": ["vertical_and_slash", 1000, 6096, 0.9294421076774597], "16": ["vertical_and_slash", 30, 800, 0.9978874921798706], "17": ["vertical_and_slash", 30, 800, 0.9265275001525879], "18": ["vertical_and_slash", 500, 700, 0.8441793322563171], "19": ["vertical_and_slash", 1000, 6096, 0.9973151087760925], "20": ["vertical_and_slash", 30, 800, 0.8883945941925049], "21": ["vertical_and_slash", 1000, 6096, 0.9890816807746887], "22": ["vertical_and_slash", 30, 800, 0.9924365282058716], "23": ["stream_llm", 100, 800, 0.98828125], "24": ["vertical_and_slash", 1000, 6096, 0.9733841419219971], "25": ["vertical_and_slash", 1000, 6096, 0.8846827149391174], "26": ["vertical_and_slash", 1000, 6096, 0.8909521698951721], "27": ["vertical_and_slash", 30, 800, 0.95379239320755], "28": ["vertical_and_slash", 30, 800, 0.989055871963501], "29": ["vertical_and_slash", 30, 800, 0.9804853796958923], "30": ["vertical_and_slash", 30, 800, 0.9921841621398926], "31": ["vertical_and_slash", 30, 800, 0.9727922677993774]}, {"0": ["stream_llm", 100, 800, 0.984375], "1": ["vertical_and_slash", 30, 800, 0.9801875352859497], "2": ["vertical_and_slash", 3500, 100, 0.9504685997962952], "3": ["vertical_and_slash", 500, 700, 0.5719053745269775], "4": ["vertical_and_slash", 30, 800, 0.9975548386573792], "5": ["vertical_and_slash", 30, 800, 0.9834421873092651], "6": 
["vertical_and_slash", 500, 700, 0.876423180103302], "7": ["vertical_and_slash", 1000, 6096, 0.9761123657226562], "8": ["vertical_and_slash", 1000, 6096, 0.6793014407157898], "9": ["vertical_and_slash", 30, 800, 0.8573703765869141], "10": ["vertical_and_slash", 500, 700, 0.9037665128707886], "11": ["stream_llm", 100, 800, 0.94921875], "12": ["stream_llm", 100, 800, 0.59375], "13": ["vertical_and_slash", 30, 800, 0.9938877820968628], "14": ["vertical_and_slash", 30, 800, 0.9964749217033386], "15": ["stream_llm", 100, 800, 0.9765625], "16": ["vertical_and_slash", 500, 700, 0.9928801655769348], "17": ["stream_llm", 100, 800, 0.859375], "18": ["stream_llm", 100, 800, 0.93359375], "19": ["vertical_and_slash", 500, 700, 0.9897311329841614], "20": ["stream_llm", 100, 800, 0.96875], "21": ["stream_llm", 100, 800, 0.9296875], "22": ["vertical_and_slash", 1000, 6096, 0.49674782156944275], "23": ["vertical_and_slash", 1000, 6096, 0.5498730540275574], "24": ["vertical_and_slash", 1000, 6096, 0.6677294373512268], "25": ["vertical_and_slash", 30, 800, 0.8520674109458923], "26": ["vertical_and_slash", 30, 800, 0.9708148241043091], "27": ["vertical_and_slash", 1000, 6096, 0.9498739838600159], "28": ["vertical_and_slash", 30, 800, 0.9852201342582703], "29": ["vertical_and_slash", 30, 800, 0.9892252683639526], "30": ["vertical_and_slash", 30, 800, 0.9976245164871216], "31": ["stream_llm", 100, 800, 0.91796875]}, {"0": ["vertical_and_slash", 30, 800, 0.976232647895813], "1": ["vertical_and_slash", 1000, 6096, 0.850098729133606], "2": ["vertical_and_slash", 30, 800, 0.9943907260894775], "3": ["stream_llm", 100, 800, 0.984375], "4": ["vertical_and_slash", 1000, 6096, 0.9408355355262756], "5": ["stream_llm", 100, 800, 0.62109375], "6": ["vertical_and_slash", 30, 800, 0.9146958589553833], "7": ["stream_llm", 100, 800, 0.578125], "8": ["vertical_and_slash", 1000, 6096, 0.9866257905960083], "9": ["stream_llm", 100, 800, 0.8671875], "10": ["stream_llm", 100, 800, 0.98828125], "11": ["stream_llm", 100, 800, 0.80078125], "12": ["vertical_and_slash", 30, 800, 0.9795709252357483], "13": ["vertical_and_slash", 1000, 6096, 0.9181753396987915], "14": ["vertical_and_slash", 30, 800, 0.9088999032974243], "15": ["stream_llm", 100, 800, 1.0], "16": ["stream_llm", 100, 800, 0.93359375], "17": ["vertical_and_slash", 1000, 6096, 0.7872908115386963], "18": ["stream_llm", 100, 800, 0.96875], "19": ["vertical_and_slash", 30, 800, 0.9915726184844971], "20": ["vertical_and_slash", 30, 800, 0.9914611577987671], "21": ["stream_llm", 100, 800, 0.94921875], "22": ["stream_llm", 100, 800, 0.91796875], "23": ["vertical_and_slash", 3500, 100, 0.4178726077079773], "24": ["vertical_and_slash", 1000, 6096, 0.9209551811218262], "25": ["stream_llm", 100, 800, 0.953125], "26": ["vertical_and_slash", 1000, 6096, 0.8251335024833679], "27": ["vertical_and_slash", 1000, 6096, 0.7916073799133301], "28": ["stream_llm", 100, 800, 0.98046875], "29": ["vertical_and_slash", 30, 800, 0.9805914163589478], "30": ["vertical_and_slash", 30, 800, 0.9889715313911438], "31": ["vertical_and_slash", 30, 800, 0.7096468210220337]}, {"0": ["vertical_and_slash", 3500, 100, 0.9098867774009705], "1": ["vertical_and_slash", 1000, 6096, 0.9131186008453369], "2": ["vertical_and_slash", 1000, 6096, 0.6216369271278381], "3": ["vertical_and_slash", 3500, 100, 0.9781222939491272], "4": ["vertical_and_slash", 1000, 6096, 0.6995159983634949], "5": ["vertical_and_slash", 30, 800, 0.7733919620513916], "6": ["stream_llm", 100, 800, 0.8046875], "7": ["stream_llm", 100, 800, 
0.9921875], "8": ["vertical_and_slash", 1000, 6096, 0.9208213686943054], "9": ["vertical_and_slash", 30, 800, 0.9892569780349731], "10": ["stream_llm", 100, 800, 0.65234375], "11": ["vertical_and_slash", 3500, 100, 0.8766616582870483], "12": ["stream_llm", 100, 800, 0.69140625], "13": ["vertical_and_slash", 30, 800, 0.9681114554405212], "14": ["vertical_and_slash", 30, 800, 0.954004168510437], "15": ["vertical_and_slash", 1000, 6096, 0.6683151721954346], "16": ["vertical_and_slash", 1000, 6096, 0.9404566287994385], "17": ["vertical_and_slash", 1000, 6096, 0.629856288433075], "18": ["vertical_and_slash", 500, 700, 0.9569997191429138], "19": ["vertical_and_slash", 1000, 6096, 0.9538705348968506], "20": ["stream_llm", 100, 800, 0.85546875], "21": ["vertical_and_slash", 1000, 6096, 0.8144884705543518], "22": ["vertical_and_slash", 30, 800, 0.95702064037323], "23": ["stream_llm", 100, 800, 0.99609375], "24": ["vertical_and_slash", 1000, 6096, 0.8552843928337097], "25": ["stream_llm", 100, 800, 0.93359375], "26": ["vertical_and_slash", 1000, 6096, 0.8885473012924194], "27": ["vertical_and_slash", 30, 800, 0.9034969210624695], "28": ["vertical_and_slash", 30, 800, 0.8834430575370789], "29": ["stream_llm", 100, 800, 0.59765625], "30": ["stream_llm", 100, 800, 0.98046875], "31": ["vertical_and_slash", 1000, 6096, 0.5801111459732056]}, {"0": ["vertical_and_slash", 1000, 6096, 0.9783773422241211], "1": ["vertical_and_slash", 1000, 6096, 0.9992927312850952], "2": ["vertical_and_slash", 30, 800, 0.9968302845954895], "3": ["vertical_and_slash", 3500, 100, 0.45828360319137573], "4": ["vertical_and_slash", 30, 800, 0.836064875125885], "5": ["vertical_and_slash", 1000, 6096, 0.8009666800498962], "6": ["vertical_and_slash", 3500, 100, 0.6518401503562927], "7": ["vertical_and_slash", 30, 800, 0.9921544790267944], "8": ["vertical_and_slash", 1000, 6096, 0.4855879545211792], "9": ["vertical_and_slash", 1000, 6096, 0.9904646277427673], "10": ["vertical_and_slash", 3500, 100, 0.8973155617713928], "11": ["vertical_and_slash", 1000, 6096, 0.8983845710754395], "12": ["stream_llm", 100, 800, 0.82421875], "13": ["vertical_and_slash", 1000, 6096, 0.8326148390769958], "14": ["vertical_and_slash", 1000, 6096, 0.44982603192329407], "15": ["vertical_and_slash", 30, 800, 0.9292823076248169], "16": ["stream_llm", 100, 800, 0.83203125], "17": ["vertical_and_slash", 500, 700, 0.8943775296211243], "18": ["vertical_and_slash", 3500, 100, 0.8824247121810913], "19": ["vertical_and_slash", 1000, 6096, 0.8916551470756531], "20": ["stream_llm", 100, 800, 0.84765625], "21": ["vertical_and_slash", 1000, 6096, 0.5656689405441284], "22": ["vertical_and_slash", 3500, 100, 0.9858580827713013], "23": ["vertical_and_slash", 3500, 100, 0.6534677743911743], "24": ["vertical_and_slash", 1000, 6096, 0.7796179056167603], "25": ["stream_llm", 100, 800, 0.984375], "26": ["stream_llm", 100, 800, 0.8125], "27": ["vertical_and_slash", 1000, 6096, 0.8051357269287109], "28": ["vertical_and_slash", 1000, 6096, 0.9759415984153748], "29": ["vertical_and_slash", 3500, 100, 0.9613996148109436], "30": ["vertical_and_slash", 30, 800, 0.9861305952072144], "31": ["vertical_and_slash", 1000, 6096, 0.5375377535820007]}, {"0": ["vertical_and_slash", 1000, 6096, 0.9526095390319824], "1": ["vertical_and_slash", 1000, 6096, 0.9219456315040588], "2": ["vertical_and_slash", 1000, 6096, 0.6329025626182556], "3": ["vertical_and_slash", 1000, 6096, 0.9703953862190247], "4": ["vertical_and_slash", 3500, 100, 0.9341285228729248], "5": ["stream_llm", 100, 800, 0.98828125], 
"6": ["vertical_and_slash", 3500, 100, 0.975139319896698], "7": ["vertical_and_slash", 30, 800, 0.9698626399040222], "8": ["vertical_and_slash", 1000, 6096, 0.8665440082550049], "9": ["vertical_and_slash", 1000, 6096, 0.9887139797210693], "10": ["vertical_and_slash", 1000, 6096, 0.9663894772529602], "11": ["vertical_and_slash", 500, 700, 0.9613908529281616], "12": ["vertical_and_slash", 1000, 6096, 0.9625579118728638], "13": ["vertical_and_slash", 3500, 100, 0.8293338418006897], "14": ["vertical_and_slash", 1000, 6096, 0.9918296933174133], "15": ["vertical_and_slash", 3500, 100, 0.6993081569671631], "16": ["vertical_and_slash", 1000, 6096, 0.7726790904998779], "17": ["vertical_and_slash", 30, 800, 0.9927448034286499], "18": ["vertical_and_slash", 3500, 100, 0.9216746091842651], "19": ["vertical_and_slash", 1000, 6096, 0.9197890758514404], "20": ["vertical_and_slash", 1000, 6096, 0.5418304800987244], "21": ["vertical_and_slash", 3500, 100, 0.7247577905654907], "22": ["vertical_and_slash", 1000, 6096, 0.8909022212028503], "23": ["vertical_and_slash", 3500, 100, 0.6162543892860413], "24": ["vertical_and_slash", 1000, 6096, 0.9798792600631714], "25": ["stream_llm", 100, 800, 0.9921875], "26": ["vertical_and_slash", 1000, 6096, 0.839588463306427], "27": ["stream_llm", 100, 800, 0.921875], "28": ["vertical_and_slash", 1000, 6096, 0.9863616228103638], "29": ["vertical_and_slash", 1000, 6096, 0.9895434975624084], "30": ["vertical_and_slash", 1000, 6096, 0.9338933825492859], "31": ["vertical_and_slash", 1000, 6096, 0.9152888655662537]}, {"0": ["vertical_and_slash", 100, 750, 0.7857484221458435], "1": ["vertical_and_slash", 3500, 100, 0.9863781332969666], "2": ["vertical_and_slash", 3500, 100, 0.9732434153556824], "3": ["vertical_and_slash", 1000, 6096, 0.7411113381385803], "4": ["vertical_and_slash", 1000, 6096, 0.9037321209907532], "5": ["vertical_and_slash", 1000, 6096, 0.7728227376937866], "6": ["vertical_and_slash", 3500, 100, 0.9566982388496399], "7": ["vertical_and_slash", 1000, 6096, 0.8955481648445129], "8": ["vertical_and_slash", 500, 700, 0.8905653357505798], "9": ["vertical_and_slash", 3500, 100, 0.9852890968322754], "10": ["vertical_and_slash", 1000, 6096, 0.5732011795043945], "11": ["vertical_and_slash", 3500, 100, 0.9701256155967712], "12": ["vertical_and_slash", 3500, 100, 0.8983554244041443], "13": ["vertical_and_slash", 100, 750, 0.9726784825325012], "14": ["vertical_and_slash", 3500, 100, 0.6008065938949585], "15": ["vertical_and_slash", 1000, 6096, 0.6582738161087036], "16": ["vertical_and_slash", 3500, 100, 0.9488815665245056], "17": ["vertical_and_slash", 100, 750, 0.9958171844482422], "18": ["vertical_and_slash", 3500, 100, 0.8186895847320557], "19": ["vertical_and_slash", 500, 700, 0.9635193347930908], "20": ["vertical_and_slash", 1000, 6096, 0.9248959422111511], "21": ["vertical_and_slash", 3500, 100, 0.9385164976119995], "22": ["vertical_and_slash", 100, 750, 0.9387568235397339], "23": ["vertical_and_slash", 1000, 6096, 0.8735635876655579], "24": ["vertical_and_slash", 500, 700, 0.890371561050415], "25": ["vertical_and_slash", 100, 750, 0.9905737638473511], "26": ["vertical_and_slash", 3500, 100, 0.946341335773468], "27": ["vertical_and_slash", 3500, 100, 0.942945659160614], "28": ["vertical_and_slash", 100, 750, 0.994683027267456], "29": ["vertical_and_slash", 500, 700, 0.9688966870307922], "30": ["vertical_and_slash", 1000, 6096, 0.9828435778617859], "31": ["vertical_and_slash", 1000, 6096, 0.8722150325775146]}, {"0": ["vertical_and_slash", 500, 700, 0.9728457927703857], 
"1": ["vertical_and_slash", 100, 750, 0.9586004018783569], "2": ["vertical_and_slash", 3500, 100, 0.9719207882881165], "3": ["vertical_and_slash", 3500, 100, 0.6680086851119995], "4": ["vertical_and_slash", 3500, 100, 0.970458984375], "5": ["vertical_and_slash", 3500, 100, 0.7634486556053162], "6": ["vertical_and_slash", 3500, 100, 0.7259127497673035], "7": ["vertical_and_slash", 100, 750, 0.9781140089035034], "8": ["vertical_and_slash", 3500, 100, 0.9952470064163208], "9": ["vertical_and_slash", 3500, 100, 0.9868772625923157], "10": ["vertical_and_slash", 3500, 100, 0.558458685874939], "11": ["vertical_and_slash", 1000, 6096, 0.7121242880821228], "12": ["vertical_and_slash", 1000, 6096, 0.7061645984649658], "13": ["vertical_and_slash", 3500, 100, 0.923751711845398], "14": ["vertical_and_slash", 1000, 6096, 0.8015576601028442], "15": ["vertical_and_slash", 500, 700, 0.9007270932197571], "16": ["vertical_and_slash", 3500, 100, 0.9591111540794373], "17": ["vertical_and_slash", 500, 700, 0.9750815033912659], "18": ["vertical_and_slash", 100, 750, 0.9805834293365479], "19": ["vertical_and_slash", 3500, 100, 0.8620939254760742], "20": ["vertical_and_slash", 3500, 100, 0.9881291389465332], "21": ["vertical_and_slash", 500, 700, 0.9975225925445557], "22": ["vertical_and_slash", 3500, 100, 0.9125117063522339], "23": ["vertical_and_slash", 3500, 100, 0.8796795010566711], "24": ["vertical_and_slash", 3500, 100, 0.9172841310501099], "25": ["vertical_and_slash", 1000, 6096, 0.8340160846710205], "26": ["vertical_and_slash", 1000, 6096, 0.8479950428009033], "27": ["vertical_and_slash", 3500, 100, 0.9778053164482117], "28": ["vertical_and_slash", 100, 750, 0.9912164211273193], "29": ["vertical_and_slash", 1000, 6096, 0.6634088754653931], "30": ["vertical_and_slash", 3500, 100, 0.9486925601959229], "31": ["vertical_and_slash", 3500, 100, 0.985546350479126]}, {"0": ["vertical_and_slash", 3500, 100, 0.7207826375961304], "1": ["vertical_and_slash", 1000, 6096, 0.7674809098243713], "2": ["vertical_and_slash", 1000, 6096, 0.5480814576148987], "3": ["vertical_and_slash", 3500, 100, 0.974454939365387], "4": ["vertical_and_slash", 100, 750, 0.9901475310325623], "5": ["vertical_and_slash", 3500, 100, 0.9111185073852539], "6": ["vertical_and_slash", 3500, 100, 0.8977652192115784], "7": ["vertical_and_slash", 500, 700, 0.8826637864112854], "8": ["vertical_and_slash", 3500, 100, 0.9674721956253052], "9": ["vertical_and_slash", 500, 700, 0.9511355757713318], "10": ["vertical_and_slash", 3500, 100, 0.9368802309036255], "11": ["vertical_and_slash", 3500, 100, 0.7037530541419983], "12": ["vertical_and_slash", 3500, 100, 0.8404982089996338], "13": ["vertical_and_slash", 3500, 100, 0.9477558732032776], "14": ["vertical_and_slash", 1000, 6096, 0.5408625602722168], "15": ["vertical_and_slash", 1000, 6096, 0.8930901288986206], "16": ["vertical_and_slash", 500, 700, 0.9620649814605713], "17": ["vertical_and_slash", 3500, 100, 0.9665637016296387], "18": ["vertical_and_slash", 3500, 100, 0.9973539710044861], "19": ["vertical_and_slash", 3500, 100, 0.9200847744941711], "20": ["vertical_and_slash", 100, 750, 0.9846996068954468], "21": ["vertical_and_slash", 3500, 100, 0.9522152543067932], "22": ["vertical_and_slash", 3500, 100, 0.9200462102890015], "23": ["vertical_and_slash", 3500, 100, 0.7189115285873413], "24": ["vertical_and_slash", 3500, 100, 0.9400286078453064], "25": ["vertical_and_slash", 3500, 100, 0.9140079617500305], "26": ["vertical_and_slash", 3500, 100, 0.9733141660690308], "27": ["vertical_and_slash", 3500, 100, 
0.9182970523834229], "28": ["vertical_and_slash", 500, 700, 0.7845987677574158], "29": ["vertical_and_slash", 500, 700, 0.953305721282959], "30": ["vertical_and_slash", 1000, 6096, 0.9332642555236816], "31": ["vertical_and_slash", 500, 700, 0.8975687026977539]}, {"0": ["vertical_and_slash", 1000, 6096, 0.8796314001083374], "1": ["vertical_and_slash", 3500, 100, 0.9541191458702087], "2": ["vertical_and_slash", 3500, 100, 0.9853596091270447], "3": ["vertical_and_slash", 3500, 100, 0.9959757924079895], "4": ["vertical_and_slash", 500, 700, 0.942274272441864], "5": ["vertical_and_slash", 3500, 100, 0.9958774447441101], "6": ["vertical_and_slash", 3500, 100, 0.762219250202179], "7": ["vertical_and_slash", 3500, 100, 0.9778050780296326], "8": ["vertical_and_slash", 3500, 100, 0.9803900718688965], "9": ["vertical_and_slash", 3500, 100, 0.9493845701217651], "10": ["vertical_and_slash", 100, 750, 0.9833114147186279], "11": ["vertical_and_slash", 3500, 100, 0.9671387076377869], "12": ["vertical_and_slash", 3500, 100, 0.8459083437919617], "13": ["vertical_and_slash", 3500, 100, 0.9625062346458435], "14": ["vertical_and_slash", 3500, 100, 0.9926583766937256], "15": ["vertical_and_slash", 3500, 100, 0.9901418089866638], "16": ["vertical_and_slash", 3500, 100, 0.9975236058235168], "17": ["vertical_and_slash", 3500, 100, 0.8961046934127808], "18": ["vertical_and_slash", 3500, 100, 0.9677743315696716], "19": ["vertical_and_slash", 1000, 6096, 0.7324523329734802], "20": ["vertical_and_slash", 1000, 6096, 0.7565687298774719], "21": ["vertical_and_slash", 3500, 100, 0.9934558272361755], "22": ["vertical_and_slash", 1000, 6096, 0.695542573928833], "23": ["vertical_and_slash", 3500, 100, 0.9594518542289734], "24": ["vertical_and_slash", 3500, 100, 0.9845080375671387], "25": ["vertical_and_slash", 3500, 100, 0.9140312075614929], "26": ["vertical_and_slash", 3500, 100, 0.9816687107086182], "27": ["vertical_and_slash", 3500, 100, 0.9777555465698242], "28": ["vertical_and_slash", 3500, 100, 0.948824405670166], "29": ["vertical_and_slash", 3500, 100, 0.48502659797668457], "30": ["vertical_and_slash", 3500, 100, 0.9340038895606995], "31": ["vertical_and_slash", 3500, 100, 0.9162462949752808]}, {"0": ["vertical_and_slash", 3500, 100, 0.9923238754272461], "1": ["vertical_and_slash", 3500, 100, 0.9678853750228882], "2": ["vertical_and_slash", 100, 750, 0.9968323111534119], "3": ["vertical_and_slash", 500, 700, 0.9936473965644836], "4": ["vertical_and_slash", 3500, 100, 0.9588732123374939], "5": ["vertical_and_slash", 500, 700, 0.9791616797447205], "6": ["vertical_and_slash", 3500, 100, 0.919694721698761], "7": ["vertical_and_slash", 1000, 6096, 0.626932680606842], "8": ["vertical_and_slash", 3500, 100, 0.9546087980270386], "9": ["vertical_and_slash", 500, 700, 0.8930553793907166], "10": ["vertical_and_slash", 100, 750, 0.9767886996269226], "11": ["vertical_and_slash", 1000, 6096, 0.7312592267990112], "12": ["vertical_and_slash", 3500, 100, 0.9913722276687622], "13": ["vertical_and_slash", 3500, 100, 0.9425638914108276], "14": ["vertical_and_slash", 3500, 100, 0.9949523210525513], "15": ["vertical_and_slash", 100, 750, 0.7187187671661377], "16": ["vertical_and_slash", 3500, 100, 0.9734897017478943], "17": ["vertical_and_slash", 3500, 100, 0.9750894904136658], "18": ["vertical_and_slash", 3500, 100, 0.9543801546096802], "19": ["vertical_and_slash", 3500, 100, 0.94287109375], "20": ["vertical_and_slash", 1000, 6096, 0.7409213185310364], "21": ["vertical_and_slash", 3500, 100, 0.9789512753486633], "22": 
["vertical_and_slash", 3500, 100, 0.9824472069740295], "23": ["vertical_and_slash", 3500, 100, 0.9614876508712769], "24": ["vertical_and_slash", 500, 700, 0.9097415208816528], "25": ["vertical_and_slash", 3500, 100, 0.7589483857154846], "26": ["vertical_and_slash", 3500, 100, 0.9711624979972839], "27": ["vertical_and_slash", 500, 700, 0.9924762845039368], "28": ["vertical_and_slash", 3500, 100, 0.8917614221572876], "29": ["vertical_and_slash", 500, 700, 0.9802823066711426], "30": ["vertical_and_slash", 3500, 100, 0.9433683156967163], "31": ["vertical_and_slash", 3500, 100, 0.9959222078323364]}, {"0": ["vertical_and_slash", 3500, 100, 0.8028379678726196], "1": ["vertical_and_slash", 3500, 100, 0.9934322237968445], "2": ["vertical_and_slash", 3500, 100, 0.9233330488204956], "3": ["vertical_and_slash", 500, 700, 0.9530222415924072], "4": ["vertical_and_slash", 1000, 6096, 0.7554510831832886], "5": ["vertical_and_slash", 3500, 100, 0.9931245446205139], "6": ["vertical_and_slash", 3500, 100, 0.8175129890441895], "7": ["vertical_and_slash", 500, 700, 0.9769982695579529], "8": ["vertical_and_slash", 3500, 100, 0.7803007364273071], "9": ["vertical_and_slash", 3500, 100, 0.8488234281539917], "10": ["vertical_and_slash", 1000, 6096, 0.7556964159011841], "11": ["vertical_and_slash", 100, 750, 0.9249212145805359], "12": ["vertical_and_slash", 1000, 6096, 0.5030975937843323], "13": ["vertical_and_slash", 3500, 100, 0.7736669778823853], "14": ["vertical_and_slash", 3500, 100, 0.8432313203811646], "15": ["vertical_and_slash", 3500, 100, 0.8078522086143494], "16": ["vertical_and_slash", 1000, 6096, 0.6152622699737549], "17": ["vertical_and_slash", 1000, 6096, 0.4801797866821289], "18": ["vertical_and_slash", 3500, 100, 0.7792356610298157], "19": ["vertical_and_slash", 3500, 100, 0.9260709285736084], "20": ["vertical_and_slash", 3500, 100, 0.9572370052337646], "21": ["vertical_and_slash", 500, 700, 0.9757252335548401], "22": ["vertical_and_slash", 100, 750, 0.9295142889022827], "23": ["vertical_and_slash", 100, 750, 0.8406566381454468], "24": ["vertical_and_slash", 500, 700, 0.9934183955192566], "25": ["vertical_and_slash", 3500, 100, 0.9811476469039917], "26": ["vertical_and_slash", 1000, 6096, 0.43748241662979126], "27": ["vertical_and_slash", 1000, 6096, 0.8173736929893494], "28": ["vertical_and_slash", 1000, 6096, 0.7964892983436584], "29": ["vertical_and_slash", 1000, 6096, 0.5660628080368042], "30": ["vertical_and_slash", 100, 750, 0.8858906626701355], "31": ["vertical_and_slash", 3500, 100, 0.7301779389381409]}, {"0": ["vertical_and_slash", 1000, 6096, 0.8143554925918579], "1": ["vertical_and_slash", 3500, 100, 0.8302785754203796], "2": ["vertical_and_slash", 3500, 100, 0.9859114289283752], "3": ["vertical_and_slash", 3500, 100, 0.6922958493232727], "4": ["vertical_and_slash", 3500, 100, 0.9597254991531372], "5": ["vertical_and_slash", 1000, 6096, 0.8074929714202881], "6": ["vertical_and_slash", 3500, 100, 0.7841739654541016], "7": ["vertical_and_slash", 3500, 100, 0.9443768262863159], "8": ["vertical_and_slash", 3500, 100, 0.9327424764633179], "9": ["vertical_and_slash", 3500, 100, 0.8796824812889099], "10": ["vertical_and_slash", 3500, 100, 0.9468095302581787], "11": ["vertical_and_slash", 3500, 100, 0.9797954559326172], "12": ["vertical_and_slash", 3500, 100, 0.9876496195793152], "13": ["vertical_and_slash", 100, 750, 0.9684455394744873], "14": ["vertical_and_slash", 3500, 100, 0.9720463156700134], "15": ["vertical_and_slash", 3500, 100, 0.9134085774421692], "16": ["vertical_and_slash", 100, 750, 
0.9962508678436279], "17": ["vertical_and_slash", 3500, 100, 0.9967661499977112], "18": ["vertical_and_slash", 3500, 100, 0.9218150973320007], "19": ["vertical_and_slash", 3500, 100, 0.9165892601013184], "20": ["vertical_and_slash", 500, 700, 0.9811153411865234], "21": ["vertical_and_slash", 1000, 6096, 0.8401690721511841], "22": ["vertical_and_slash", 100, 750, 0.9827044606208801], "23": ["vertical_and_slash", 500, 700, 0.9265505075454712], "24": ["vertical_and_slash", 3500, 100, 0.8814885020256042], "25": ["vertical_and_slash", 1000, 6096, 0.8774723410606384], "26": ["vertical_and_slash", 1000, 6096, 0.8981026411056519], "27": ["vertical_and_slash", 100, 750, 0.995216429233551], "28": ["vertical_and_slash", 3500, 100, 0.9950628280639648], "29": ["vertical_and_slash", 500, 700, 0.9678530693054199], "30": ["vertical_and_slash", 100, 750, 0.9900303483009338], "31": ["vertical_and_slash", 3500, 100, 0.9148485064506531]}, {"0": ["vertical_and_slash", 3500, 100, 0.7734143137931824], "1": ["vertical_and_slash", 3500, 100, 0.9431662559509277], "2": ["vertical_and_slash", 100, 750, 0.9125087857246399], "3": ["vertical_and_slash", 3500, 100, 0.9382316470146179], "4": ["vertical_and_slash", 1000, 6096, 0.7059416174888611], "5": ["vertical_and_slash", 3500, 100, 0.6978054642677307], "6": ["vertical_and_slash", 3500, 100, 0.9927070140838623], "7": ["vertical_and_slash", 3500, 100, 0.9393529295921326], "8": ["vertical_and_slash", 100, 750, 0.9231113195419312], "9": ["vertical_and_slash", 3500, 100, 0.9985975623130798], "10": ["vertical_and_slash", 500, 700, 0.9555321335792542], "11": ["vertical_and_slash", 3500, 100, 0.9785676002502441], "12": ["vertical_and_slash", 500, 700, 0.9968464374542236], "13": ["vertical_and_slash", 3500, 100, 0.9894333481788635], "14": ["vertical_and_slash", 500, 700, 0.8927757740020752], "15": ["vertical_and_slash", 3500, 100, 0.9463996887207031], "16": ["vertical_and_slash", 3500, 100, 0.9756723642349243], "17": ["vertical_and_slash", 3500, 100, 0.970882773399353], "18": ["vertical_and_slash", 1000, 6096, 0.6809303164482117], "19": ["vertical_and_slash", 3500, 100, 0.9938862919807434], "20": ["vertical_and_slash", 3500, 100, 0.9821802973747253], "21": ["vertical_and_slash", 3500, 100, 0.9383650422096252], "22": ["vertical_and_slash", 3500, 100, 0.8643637299537659], "23": ["vertical_and_slash", 100, 750, 0.9771586656570435], "24": ["vertical_and_slash", 500, 700, 0.976405143737793], "25": ["vertical_and_slash", 3500, 100, 0.9743276238441467], "26": ["vertical_and_slash", 3500, 100, 0.9265220761299133], "27": ["vertical_and_slash", 3500, 100, 0.9841408729553223], "28": ["vertical_and_slash", 500, 700, 0.9391534328460693], "29": ["vertical_and_slash", 3500, 100, 0.9312986135482788], "30": ["vertical_and_slash", 3500, 100, 0.8832992911338806], "31": ["vertical_and_slash", 3500, 100, 0.9811874628067017]}, {"0": ["vertical_and_slash", 3500, 100, 0.9956807494163513], "1": ["vertical_and_slash", 3500, 100, 0.9670407772064209], "2": ["vertical_and_slash", 100, 750, 0.9973832964897156], "3": ["vertical_and_slash", 100, 750, 0.99891597032547], "4": ["vertical_and_slash", 3500, 100, 0.9931758642196655], "5": ["vertical_and_slash", 100, 750, 0.996113121509552], "6": ["vertical_and_slash", 3500, 100, 0.9983065724372864], "7": ["vertical_and_slash", 3500, 100, 0.9833848476409912], "8": ["vertical_and_slash", 3500, 100, 0.9948523640632629], "9": ["vertical_and_slash", 3500, 100, 0.8683006167411804], "10": ["vertical_and_slash", 3500, 100, 0.9931465983390808], "11": ["vertical_and_slash", 
100, 750, 0.984261691570282], "12": ["vertical_and_slash", 100, 750, 0.9601353406906128], "13": ["vertical_and_slash", 500, 700, 0.9203216433525085], "14": ["vertical_and_slash", 3500, 100, 0.9650700092315674], "15": ["vertical_and_slash", 100, 750, 0.984341561794281], "16": ["vertical_and_slash", 3500, 100, 0.9989381432533264], "17": ["vertical_and_slash", 1000, 6096, 0.8591818809509277], "18": ["vertical_and_slash", 500, 700, 0.959535539150238], "19": ["vertical_and_slash", 3500, 100, 0.9685975909233093], "20": ["vertical_and_slash", 3500, 100, 0.9992274045944214], "21": ["vertical_and_slash", 3500, 100, 0.9054502248764038], "22": ["vertical_and_slash", 3500, 100, 0.9957486391067505], "23": ["vertical_and_slash", 3500, 100, 0.9970229864120483], "24": ["vertical_and_slash", 3500, 100, 0.933996319770813], "25": ["vertical_and_slash", 3500, 100, 0.9522771239280701], "26": ["vertical_and_slash", 3500, 100, 0.8640444278717041], "27": ["vertical_and_slash", 3500, 100, 0.9864702820777893], "28": ["vertical_and_slash", 1000, 6096, 0.8701584935188293], "29": ["vertical_and_slash", 3500, 100, 0.9872081279754639], "30": ["vertical_and_slash", 3500, 100, 0.9637035727500916], "31": ["vertical_and_slash", 3500, 100, 0.7964584827423096]}, {"0": ["vertical_and_slash", 500, 700, 0.944079577922821], "1": ["vertical_and_slash", 1000, 6096, 0.7686152458190918], "2": ["vertical_and_slash", 3500, 100, 0.9423201680183411], "3": ["vertical_and_slash", 3500, 100, 0.9597930908203125], "4": ["vertical_and_slash", 3500, 100, 0.9981894493103027], "5": ["vertical_and_slash", 100, 750, 0.9951789975166321], "6": ["vertical_and_slash", 3500, 100, 0.9678981304168701], "7": ["vertical_and_slash", 3500, 100, 0.8912110924720764], "8": ["vertical_and_slash", 100, 750, 0.9829361438751221], "9": ["vertical_and_slash", 500, 700, 0.9326693415641785], "10": ["vertical_and_slash", 3500, 100, 0.7954592108726501], "11": ["vertical_and_slash", 3500, 100, 0.9361847639083862], "12": ["vertical_and_slash", 3500, 100, 0.9777213335037231], "13": ["vertical_and_slash", 100, 750, 0.7402770519256592], "14": ["vertical_and_slash", 1000, 6096, 0.8369068503379822], "15": ["vertical_and_slash", 3500, 100, 0.8386251926422119], "16": ["vertical_and_slash", 500, 700, 0.9928125143051147], "17": ["vertical_and_slash", 3500, 100, 0.9980320930480957], "18": ["vertical_and_slash", 100, 750, 0.99200838804245], "19": ["vertical_and_slash", 3500, 100, 0.9937632083892822], "20": ["vertical_and_slash", 1000, 6096, 0.8582853674888611], "21": ["vertical_and_slash", 500, 700, 0.8901017308235168], "22": ["vertical_and_slash", 3500, 100, 0.9825611710548401], "23": ["vertical_and_slash", 3500, 100, 0.9956728219985962], "24": ["vertical_and_slash", 3500, 100, 0.992565929889679], "25": ["vertical_and_slash", 3500, 100, 0.9841880202293396], "26": ["vertical_and_slash", 1000, 6096, 0.8873481750488281], "27": ["vertical_and_slash", 100, 750, 0.9767672419548035], "28": ["vertical_and_slash", 3500, 100, 0.9931612610816956], "29": ["vertical_and_slash", 3500, 100, 0.9209384918212891], "30": ["vertical_and_slash", 100, 750, 0.7578334212303162], "31": ["vertical_and_slash", 3500, 100, 0.9578611850738525]}, {"0": ["vertical_and_slash", 100, 750, 0.9389412999153137], "1": ["vertical_and_slash", 100, 750, 0.9428157210350037], "2": ["vertical_and_slash", 3500, 100, 0.9956400990486145], "3": ["vertical_and_slash", 100, 750, 0.9144065976142883], "4": ["vertical_and_slash", 1000, 6096, 0.8475824594497681], "5": ["vertical_and_slash", 100, 750, 0.996335506439209], "6": 
["vertical_and_slash", 3500, 100, 0.9988783597946167], "7": ["vertical_and_slash", 3500, 100, 0.94597989320755], "8": ["vertical_and_slash", 3500, 100, 0.9713111519813538], "9": ["vertical_and_slash", 100, 750, 0.9670871496200562], "10": ["vertical_and_slash", 3500, 100, 0.9996585249900818], "11": ["vertical_and_slash", 3500, 100, 0.9820530414581299], "12": ["vertical_and_slash", 3500, 100, 0.9983968138694763], "13": ["vertical_and_slash", 3500, 100, 0.9315072298049927], "14": ["vertical_and_slash", 3500, 100, 0.9930176138877869], "15": ["vertical_and_slash", 500, 700, 0.9945250749588013], "16": ["vertical_and_slash", 100, 750, 0.9049948453903198], "17": ["vertical_and_slash", 3500, 100, 0.9992651343345642], "18": ["vertical_and_slash", 500, 700, 0.9942126274108887], "19": ["vertical_and_slash", 500, 700, 0.9891477227210999], "20": ["vertical_and_slash", 3500, 100, 0.9028084874153137], "21": ["vertical_and_slash", 100, 750, 0.9475080370903015], "22": ["vertical_and_slash", 500, 700, 0.9690455794334412], "23": ["vertical_and_slash", 3500, 100, 0.9446419477462769], "24": ["vertical_and_slash", 3500, 100, 0.9801247715950012], "25": ["vertical_and_slash", 100, 750, 0.9777910113334656], "26": ["vertical_and_slash", 3500, 100, 0.7017547488212585], "27": ["vertical_and_slash", 3500, 100, 0.9493237137794495], "28": ["vertical_and_slash", 100, 750, 0.9993017315864563], "29": ["vertical_and_slash", 3500, 100, 0.893531858921051], "30": ["vertical_and_slash", 3500, 100, 0.9467594623565674], "31": ["vertical_and_slash", 3500, 100, 0.9743610620498657]}, {"0": ["vertical_and_slash", 3500, 100, 0.985114574432373], "1": ["vertical_and_slash", 500, 700, 0.9950987696647644], "2": ["vertical_and_slash", 3500, 100, 0.7027000784873962], "3": ["vertical_and_slash", 3500, 100, 0.9855831265449524], "4": ["vertical_and_slash", 3500, 100, 0.9874288439750671], "5": ["vertical_and_slash", 1000, 6096, 0.7125917673110962], "6": ["vertical_and_slash", 3500, 100, 0.9454708695411682], "7": ["vertical_and_slash", 3500, 100, 0.9898356199264526], "8": ["vertical_and_slash", 3500, 100, 0.9445544481277466], "9": ["vertical_and_slash", 3500, 100, 0.988140344619751], "10": ["vertical_and_slash", 500, 700, 0.981208860874176], "11": ["vertical_and_slash", 500, 700, 0.9874861836433411], "12": ["vertical_and_slash", 3500, 100, 0.9963038563728333], "13": ["vertical_and_slash", 100, 750, 0.9972052574157715], "14": ["vertical_and_slash", 3500, 100, 0.9943816065788269], "15": ["vertical_and_slash", 100, 750, 0.8364889025688171], "16": ["vertical_and_slash", 100, 750, 0.9870871901512146], "17": ["vertical_and_slash", 100, 750, 0.998099684715271], "18": ["vertical_and_slash", 3500, 100, 0.8674955368041992], "19": ["vertical_and_slash", 500, 700, 0.9969808459281921], "20": ["vertical_and_slash", 3500, 100, 0.8848986625671387], "21": ["vertical_and_slash", 1000, 6096, 0.867315411567688], "22": ["vertical_and_slash", 500, 700, 0.9908551573753357], "23": ["vertical_and_slash", 100, 750, 0.8952099680900574], "24": ["vertical_and_slash", 500, 700, 0.9714990854263306], "25": ["vertical_and_slash", 100, 750, 0.8733819723129272], "26": ["vertical_and_slash", 3500, 100, 0.9205271005630493], "27": ["vertical_and_slash", 3500, 100, 0.9833540916442871], "28": ["vertical_and_slash", 3500, 100, 0.9445760846138], "29": ["vertical_and_slash", 3500, 100, 0.9536135792732239], "30": ["vertical_and_slash", 500, 700, 0.9753504991531372], "31": ["vertical_and_slash", 1000, 6096, 0.8801259398460388]}, {"0": ["vertical_and_slash", 3500, 100, 0.9614631533622742], 
"1": ["vertical_and_slash", 3500, 100, 0.9763227105140686], "2": ["vertical_and_slash", 100, 750, 0.970956563949585], "3": ["vertical_and_slash", 100, 750, 0.9151788949966431], "4": ["vertical_and_slash", 3500, 100, 0.9920399188995361], "5": ["vertical_and_slash", 3500, 100, 0.9422896504402161], "6": ["vertical_and_slash", 3500, 100, 0.986482560634613], "7": ["vertical_and_slash", 3500, 100, 0.9976206421852112], "8": ["vertical_and_slash", 100, 750, 0.9943424463272095], "9": ["vertical_and_slash", 3500, 100, 0.9936824440956116], "10": ["vertical_and_slash", 3500, 100, 0.9882729649543762], "11": ["vertical_and_slash", 100, 750, 0.9862287640571594], "12": ["vertical_and_slash", 500, 700, 0.9886087775230408], "13": ["vertical_and_slash", 3500, 100, 0.9989089369773865], "14": ["vertical_and_slash", 3500, 100, 0.9651134610176086], "15": ["vertical_and_slash", 3500, 100, 0.9826948046684265], "16": ["vertical_and_slash", 3500, 100, 0.9450136423110962], "17": ["vertical_and_slash", 3500, 100, 0.9979375004768372], "18": ["vertical_and_slash", 3500, 100, 0.9520789384841919], "19": ["vertical_and_slash", 3500, 100, 0.9316532015800476], "20": ["vertical_and_slash", 100, 750, 0.9904720187187195], "21": ["vertical_and_slash", 3500, 100, 0.999125599861145], "22": ["vertical_and_slash", 3500, 100, 0.9995089769363403], "23": ["vertical_and_slash", 100, 750, 0.9886007308959961], "24": ["vertical_and_slash", 3500, 100, 0.9961583018302917], "25": ["vertical_and_slash", 3500, 100, 0.9961526393890381], "26": ["vertical_and_slash", 3500, 100, 0.9557645916938782], "27": ["vertical_and_slash", 3500, 100, 0.8775650262832642], "28": ["vertical_and_slash", 3500, 100, 0.986892580986023], "29": ["vertical_and_slash", 3500, 100, 0.9749740958213806], "30": ["vertical_and_slash", 3500, 100, 0.8765645027160645], "31": ["vertical_and_slash", 3500, 100, 0.9494763016700745]}, {"0": ["vertical_and_slash", 3500, 100, 0.9797922372817993], "1": ["vertical_and_slash", 3500, 100, 0.9958779811859131], "2": ["vertical_and_slash", 3500, 100, 0.9976977705955505], "3": ["vertical_and_slash", 3500, 100, 0.9764806628227234], "4": ["vertical_and_slash", 3500, 100, 0.9868356585502625], "5": ["vertical_and_slash", 1000, 6096, 0.8740545511245728], "6": ["vertical_and_slash", 3500, 100, 0.9939981698989868], "7": ["vertical_and_slash", 1000, 6096, 0.7613811492919922], "8": ["vertical_and_slash", 3500, 100, 0.9811347723007202], "9": ["vertical_and_slash", 3500, 100, 0.9840614795684814], "10": ["vertical_and_slash", 1000, 6096, 0.8657892346382141], "11": ["vertical_and_slash", 3500, 100, 0.9502456188201904], "12": ["vertical_and_slash", 100, 750, 0.9104490280151367], "13": ["vertical_and_slash", 3500, 100, 0.9950721263885498], "14": ["vertical_and_slash", 3500, 100, 0.9724959135055542], "15": ["vertical_and_slash", 1000, 6096, 0.8955191373825073], "16": ["vertical_and_slash", 3500, 100, 0.9936071038246155], "17": ["vertical_and_slash", 3500, 100, 0.9285928606987], "18": ["vertical_and_slash", 3500, 100, 0.756338357925415], "19": ["vertical_and_slash", 3500, 100, 0.9665532112121582], "20": ["vertical_and_slash", 100, 750, 0.9970663785934448], "21": ["vertical_and_slash", 3500, 100, 0.9806201457977295], "22": ["vertical_and_slash", 1000, 6096, 0.8115424513816833], "23": ["vertical_and_slash", 1000, 6096, 0.8631585836410522], "24": ["vertical_and_slash", 3500, 100, 0.9782901406288147], "25": ["vertical_and_slash", 3500, 100, 0.9858242273330688], "26": ["vertical_and_slash", 3500, 100, 0.9617720246315002], "27": ["vertical_and_slash", 3500, 100, 
0.997412919998169], "28": ["vertical_and_slash", 3500, 100, 0.8432300090789795], "29": ["vertical_and_slash", 500, 700, 0.9955722093582153], "30": ["vertical_and_slash", 3500, 100, 0.9938695430755615], "31": ["vertical_and_slash", 3500, 100, 0.9511440396308899]}, {"0": ["vertical_and_slash", 3500, 100, 0.988155722618103], "1": ["vertical_and_slash", 3500, 100, 0.9747615456581116], "2": ["vertical_and_slash", 100, 750, 0.9718871712684631], "3": ["vertical_and_slash", 100, 750, 0.9756971597671509], "4": ["vertical_and_slash", 3500, 100, 0.947630763053894], "5": ["vertical_and_slash", 100, 750, 0.99262934923172], "6": ["vertical_and_slash", 3500, 100, 0.9955495595932007], "7": ["vertical_and_slash", 3500, 100, 0.8609271049499512], "8": ["vertical_and_slash", 3500, 100, 0.974815845489502], "9": ["vertical_and_slash", 3500, 100, 0.9884821772575378], "10": ["vertical_and_slash", 3500, 100, 0.9901348352432251], "11": ["vertical_and_slash", 100, 750, 0.9968274831771851], "12": ["vertical_and_slash", 3500, 100, 0.9918603897094727], "13": ["vertical_and_slash", 500, 700, 0.9757610559463501], "14": ["vertical_and_slash", 3500, 100, 0.9900703430175781], "15": ["vertical_and_slash", 500, 700, 0.9938023090362549], "16": ["vertical_and_slash", 1000, 6096, 0.8913345336914062], "17": ["vertical_and_slash", 500, 700, 0.9903258681297302], "18": ["vertical_and_slash", 100, 750, 0.9566823244094849], "19": ["vertical_and_slash", 100, 750, 0.9777167439460754], "20": ["vertical_and_slash", 3500, 100, 0.9674810767173767], "21": ["vertical_and_slash", 100, 750, 0.9178389310836792], "22": ["vertical_and_slash", 100, 750, 0.9882655143737793], "23": ["vertical_and_slash", 100, 750, 0.9989043474197388], "24": ["vertical_and_slash", 1000, 6096, 0.8574219942092896], "25": ["vertical_and_slash", 3500, 100, 0.9944363236427307], "26": ["vertical_and_slash", 3500, 100, 0.9970851540565491], "27": ["vertical_and_slash", 500, 700, 0.9904334545135498], "28": ["vertical_and_slash", 3500, 100, 0.9851230978965759], "29": ["vertical_and_slash", 3500, 100, 0.9900650978088379], "30": ["vertical_and_slash", 3500, 100, 0.9743545055389404], "31": ["vertical_and_slash", 500, 700, 0.9190711975097656]}, {"0": ["vertical_and_slash", 100, 750, 0.9716458320617676], "1": ["vertical_and_slash", 3500, 100, 0.9384027719497681], "2": ["vertical_and_slash", 3500, 100, 0.9696847796440125], "3": ["vertical_and_slash", 3500, 100, 0.9812428951263428], "4": ["vertical_and_slash", 1000, 6096, 0.5853931903839111], "5": ["vertical_and_slash", 3500, 100, 0.7994469404220581], "6": ["vertical_and_slash", 3500, 100, 0.9933062791824341], "7": ["vertical_and_slash", 3500, 100, 0.986369788646698], "8": ["vertical_and_slash", 3500, 100, 0.8895794153213501], "9": ["vertical_and_slash", 1000, 6096, 0.8238524794578552], "10": ["vertical_and_slash", 500, 700, 0.93126380443573], "11": ["vertical_and_slash", 3500, 100, 0.962100088596344], "12": ["vertical_and_slash", 3500, 100, 0.8438158631324768], "13": ["vertical_and_slash", 500, 700, 0.9969620108604431], "14": ["vertical_and_slash", 1000, 6096, 0.8904788494110107], "15": ["vertical_and_slash", 100, 750, 0.9925360679626465], "16": ["vertical_and_slash", 3500, 100, 0.9222993850708008], "17": ["vertical_and_slash", 1000, 6096, 0.6627880334854126], "18": ["vertical_and_slash", 1000, 6096, 0.8668970465660095], "19": ["vertical_and_slash", 3500, 100, 0.9340634346008301], "20": ["vertical_and_slash", 3500, 100, 0.9503065347671509], "21": ["vertical_and_slash", 3500, 100, 0.9436649680137634], "22": ["vertical_and_slash", 
3500, 100, 0.9768727421760559], "23": ["vertical_and_slash", 100, 750, 0.988473653793335], "24": ["vertical_and_slash", 3500, 100, 0.8777113556861877], "25": ["vertical_and_slash", 3500, 100, 0.8750200271606445], "26": ["vertical_and_slash", 1000, 6096, 0.4957360625267029], "27": ["vertical_and_slash", 3500, 100, 0.9804278016090393], "28": ["vertical_and_slash", 1000, 6096, 0.8486401438713074], "29": ["vertical_and_slash", 3500, 100, 0.8954175114631653], "30": ["vertical_and_slash", 3500, 100, 0.9651874899864197], "31": ["vertical_and_slash", 3500, 100, 0.9620938301086426]}, {"0": ["vertical_and_slash", 100, 750, 0.920842707157135], "1": ["vertical_and_slash", 3500, 100, 0.7215947508811951], "2": ["vertical_and_slash", 3500, 100, 0.9858340620994568], "3": ["vertical_and_slash", 3500, 100, 0.7861597537994385], "4": ["vertical_and_slash", 3500, 100, 0.7639158964157104], "5": ["vertical_and_slash", 3500, 100, 0.887671947479248], "6": ["vertical_and_slash", 3500, 100, 0.8891316652297974], "7": ["vertical_and_slash", 1000, 6096, 0.8906923532485962], "8": ["vertical_and_slash", 3500, 100, 0.8836961984634399], "9": ["vertical_and_slash", 3500, 100, 0.7728190422058105], "10": ["vertical_and_slash", 3500, 100, 0.9507467746734619], "11": ["vertical_and_slash", 500, 700, 0.7829118967056274], "12": ["vertical_and_slash", 100, 750, 0.8214483857154846], "13": ["vertical_and_slash", 3500, 100, 0.7196475863456726], "14": ["vertical_and_slash", 500, 700, 0.8691932559013367], "15": ["vertical_and_slash", 1000, 6096, 0.6569814085960388], "16": ["vertical_and_slash", 100, 750, 0.9087151288986206], "17": ["vertical_and_slash", 3500, 100, 0.7609643936157227], "18": ["vertical_and_slash", 3500, 100, 0.8670530319213867], "19": ["vertical_and_slash", 1000, 6096, 0.7779831290245056], "20": ["vertical_and_slash", 100, 750, 0.923963725566864], "21": ["vertical_and_slash", 1000, 6096, 0.5714190006256104], "22": ["vertical_and_slash", 500, 700, 0.6351447105407715], "23": ["vertical_and_slash", 100, 750, 0.870464026927948], "24": ["vertical_and_slash", 1000, 6096, 0.6272542476654053], "25": ["vertical_and_slash", 1000, 6096, 0.7302500009536743], "26": ["vertical_and_slash", 3500, 100, 0.9410015940666199], "27": ["vertical_and_slash", 3500, 100, 0.793304979801178], "28": ["vertical_and_slash", 1000, 6096, 0.837500274181366], "29": ["vertical_and_slash", 1000, 6096, 0.766721248626709], "30": ["vertical_and_slash", 1000, 6096, 0.7082650065422058], "31": ["vertical_and_slash", 3500, 100, 0.8947907090187073]}, {"0": ["vertical_and_slash", 100, 750, 0.8983681797981262], "1": ["vertical_and_slash", 1000, 6096, 0.9650430083274841], "2": ["vertical_and_slash", 500, 700, 0.9532706141471863], "3": ["vertical_and_slash", 3500, 100, 0.8198072910308838], "4": ["vertical_and_slash", 1000, 6096, 0.840558648109436], "5": ["vertical_and_slash", 3500, 100, 0.8227204084396362], "6": ["vertical_and_slash", 1000, 6096, 0.5979130268096924], "7": ["vertical_and_slash", 1000, 6096, 0.7691975235939026], "8": ["vertical_and_slash", 1000, 6096, 0.8089779615402222], "9": ["vertical_and_slash", 100, 750, 0.8689324855804443], "10": ["vertical_and_slash", 100, 750, 0.8621079325675964], "11": ["vertical_and_slash", 500, 700, 0.9871177673339844], "12": ["vertical_and_slash", 1000, 6096, 0.9468575716018677], "13": ["vertical_and_slash", 100, 750, 0.9075571894645691], "14": ["vertical_and_slash", 1000, 6096, 0.911694347858429], "15": ["vertical_and_slash", 100, 750, 0.9817390441894531], "16": ["vertical_and_slash", 1000, 6096, 0.7491167783737183], "17": 
["vertical_and_slash", 1000, 6096, 0.8255623579025269], "18": ["vertical_and_slash", 1000, 6096, 0.8701649308204651], "19": ["vertical_and_slash", 3500, 100, 0.838506817817688], "20": ["vertical_and_slash", 1000, 6096, 0.8749529123306274], "21": ["vertical_and_slash", 500, 700, 0.8783859610557556], "22": ["vertical_and_slash", 3500, 100, 0.9302544593811035], "23": ["vertical_and_slash", 100, 750, 0.9118035435676575], "24": ["vertical_and_slash", 1000, 6096, 0.7892093658447266], "25": ["vertical_and_slash", 100, 750, 0.904501736164093], "26": ["vertical_and_slash", 3500, 100, 0.947079598903656], "27": ["vertical_and_slash", 1000, 6096, 0.5719630718231201], "28": ["vertical_and_slash", 3500, 100, 0.9740545153617859], "29": ["vertical_and_slash", 100, 750, 0.8365178108215332], "30": ["vertical_and_slash", 3500, 100, 0.8893513083457947], "31": ["vertical_and_slash", 1000, 6096, 0.923209547996521]}]
minference/configs/Yi_9B_200k_kv_out_v32_fit_o_best_pattern.json ADDED
@@ -0,0 +1 @@
+ [{"0": ["vertical_and_slash", 1000, 4096, 12982], "1": ["vertical_and_slash", 1000, 4096, 54], "2": ["vertical_and_slash", 1000, 4096, 0], "3": ["vertical_and_slash", 1000, 4096, 5], "4": ["vertical_and_slash", 1000, 4096, 57], "5": ["vertical_and_slash", 1000, 4096, 93], "6": ["vertical_and_slash", 1000, 4096, 5], "7": ["vertical_and_slash", 1000, 4096, 0], "8": ["vertical_and_slash", 1000, 4096, 4], "9": ["vertical_and_slash", 1000, 4096, 8], "10": ["vertical_and_slash", 1000, 4096, 10020], "11": ["vertical_and_slash", 1000, 4096, 0], "12": ["vertical_and_slash", 1000, 4096, 222290], "13": ["vertical_and_slash", 1000, 4096, 162], "14": ["vertical_and_slash", 1000, 4096, 3], "15": ["vertical_and_slash", 1000, 4096, 11], "16": ["vertical_and_slash", 1000, 4096, 10], "17": ["vertical_and_slash", 1000, 4096, 4], "18": ["vertical_and_slash", 1000, 4096, 26297], "19": ["vertical_and_slash", 1000, 4096, 3], "20": ["vertical_and_slash", 1000, 4096, 0], "21": ["vertical_and_slash", 1000, 4096, 0], "22": ["vertical_and_slash", 1000, 4096, 1627], "23": ["vertical_and_slash", 1000, 4096, 7], "24": ["vertical_and_slash", 1000, 4096, 0], "25": ["vertical_and_slash", 1000, 4096, 859], "26": ["vertical_and_slash", 1000, 4096, 0], "27": ["vertical_and_slash", 1000, 4096, 0], "28": ["vertical_and_slash", 1000, 4096, 484], "29": ["vertical_and_slash", 1000, 4096, 1239], "30": ["vertical_and_slash", 1000, 4096, 0], "31": ["vertical_and_slash", 1000, 4096, 0]}, {"0": ["vertical_and_slash", 1000, 4096, 430388], "1": ["vertical_and_slash", 1000, 4096, 299591], "2": ["vertical_and_slash", 1000, 4096, 5802], "3": ["vertical_and_slash", 1000, 4096, 22390], "4": ["vertical_and_slash", 1000, 4096, 284950], "5": ["vertical_and_slash", 1000, 4096, 237516], "6": ["vertical_and_slash", 1000, 4096, 39541], "7": ["vertical_and_slash", 1000, 4096, 46216], "8": ["vertical_and_slash", 1000, 4096, 782645], "9": ["vertical_and_slash", 1000, 4096, 8], "10": ["vertical_and_slash", 1000, 4096, 18], "11": ["vertical_and_slash", 1000, 4096, 18890], "12": ["vertical_and_slash", 1000, 4096, 141], "13": ["vertical_and_slash", 1000, 4096, 53457], "14": ["vertical_and_slash", 1000, 4096, 34], "15": ["vertical_and_slash", 1000, 4096, 0], "16": ["vertical_and_slash", 1000, 4096, 246481], "17": ["vertical_and_slash", 1000, 4096, 135148], "18": ["vertical_and_slash", 1000, 4096, 48561], "19": ["vertical_and_slash", 1000, 4096, 54785], "20": ["vertical_and_slash", 1000, 4096, 95382], "21": ["vertical_and_slash", 1000, 4096, 387], "22": ["vertical_and_slash", 1000, 4096, 1750], "23": ["vertical_and_slash", 1000, 4096, 201661], "24": ["vertical_and_slash", 1000, 4096, 51272], "25": ["vertical_and_slash", 1000, 4096, 115255], "26": ["vertical_and_slash", 1000, 4096, 6], "27": ["vertical_and_slash", 1000, 4096, 6895], "28": ["vertical_and_slash", 1000, 4096, 2335], "29": ["vertical_and_slash", 1000, 4096, 23041], "30": ["vertical_and_slash", 1000, 4096, 6270087], "31": ["vertical_and_slash", 1000, 4096, 0]}, {"0": ["vertical_and_slash", 100, 800, 11], "1": ["vertical_and_slash", 30, 800, 5], "2": ["vertical_and_slash", 30, 800, 2790], "3": ["vertical_and_slash", 30, 800, 37], "4": ["vertical_and_slash", 30, 800, 2903], "5": ["vertical_and_slash", 30, 800, 1], "6": ["vertical_and_slash", 30, 800, 101], "7": ["vertical_and_slash", 100, 800, 16677], "8": ["vertical_and_slash", 1000, 4096, 99796], "9": ["vertical_and_slash", 30, 800, 8116], "10": ["vertical_and_slash", 30, 800, 1993], "11": ["vertical_and_slash", 1000, 4096, 2561], "12": 
["vertical_and_slash", 30, 800, 21], "13": ["vertical_and_slash", 30, 800, 9624], "14": ["vertical_and_slash", 1000, 4096, 3894510], "15": ["vertical_and_slash", 1000, 4096, 66775], "16": ["vertical_and_slash", 30, 800, 1569], "17": ["vertical_and_slash", 1000, 4096, 146958], "18": ["vertical_and_slash", 30, 800, 29976], "19": ["vertical_and_slash", 1000, 4096, 269566], "20": ["vertical_and_slash", 100, 800, 50639], "21": ["vertical_and_slash", 30, 800, 114641], "22": ["vertical_and_slash", 1000, 4096, 238607], "23": ["vertical_and_slash", 100, 800, 302385], "24": ["vertical_and_slash", 1000, 4096, 4893], "25": ["vertical_and_slash", 30, 800, 322], "26": ["vertical_and_slash", 1000, 4096, 3639], "27": ["vertical_and_slash", 100, 800, 131], "28": ["vertical_and_slash", 1000, 4096, 348560], "29": ["vertical_and_slash", 1000, 4096, 14611], "30": ["vertical_and_slash", 30, 800, 86], "31": ["vertical_and_slash", 1000, 4096, 900]}, {"0": ["vertical_and_slash", 100, 800, 64], "1": ["vertical_and_slash", 1000, 4096, 10], "2": ["vertical_and_slash", 500, 700, 77], "3": ["vertical_and_slash", 1000, 4096, 4193], "4": ["vertical_and_slash", 100, 800, 83525], "5": ["vertical_and_slash", 1000, 4096, 6], "6": ["vertical_and_slash", 1000, 4096, 27907], "7": ["vertical_and_slash", 1000, 4096, 42], "8": ["vertical_and_slash", 30, 800, 21349], "9": ["vertical_and_slash", 30, 800, 5018], "10": ["vertical_and_slash", 30, 800, 1663], "11": ["vertical_and_slash", 30, 800, 86902], "12": ["vertical_and_slash", 30, 800, 781], "13": ["vertical_and_slash", 100, 800, 339811], "14": ["vertical_and_slash", 100, 800, 696206], "15": ["vertical_and_slash", 30, 800, 47681], "16": ["vertical_and_slash", 30, 800, 4251], "17": ["vertical_and_slash", 1000, 4096, 6373945], "18": ["vertical_and_slash", 100, 800, 289132], "19": ["vertical_and_slash", 1000, 4096, 10273], "20": ["vertical_and_slash", 1000, 4096, 457078], "21": ["vertical_and_slash", 1000, 4096, 1372461], "22": ["vertical_and_slash", 100, 800, 11108], "23": ["vertical_and_slash", 100, 800, 2979], "24": ["vertical_and_slash", 1000, 4096, 30365], "25": ["vertical_and_slash", 500, 700, 142429], "26": ["vertical_and_slash", 500, 700, 6300], "27": ["vertical_and_slash", 30, 800, 4711], "28": ["vertical_and_slash", 500, 700, 4810], "29": ["vertical_and_slash", 500, 700, 25571], "30": ["vertical_and_slash", 500, 700, 7924], "31": ["vertical_and_slash", 500, 700, 3337]}, {"0": ["vertical_and_slash", 30, 800, 34678], "1": ["vertical_and_slash", 30, 800, 13104], "2": ["vertical_and_slash", 30, 800, 4929], "3": ["vertical_and_slash", 100, 800, 9351380], "4": ["vertical_and_slash", 100, 800, 333814], "5": ["vertical_and_slash", 100, 800, 603408], "6": ["vertical_and_slash", 30, 800, 18975], "7": ["vertical_and_slash", 30, 800, 8848], "8": ["vertical_and_slash", 100, 800, 1690132], "9": ["vertical_and_slash", 30, 800, 59610], "10": ["vertical_and_slash", 500, 700, 1234], "11": ["vertical_and_slash", 1000, 4096, 74422], "12": ["vertical_and_slash", 1000, 4096, 504212], "13": ["vertical_and_slash", 30, 800, 3100], "14": ["vertical_and_slash", 100, 800, 1160], "15": ["vertical_and_slash", 500, 700, 5784], "16": ["vertical_and_slash", 30, 800, 18695], "17": ["vertical_and_slash", 30, 800, 2090], "18": ["vertical_and_slash", 30, 800, 28562], "19": ["vertical_and_slash", 30, 800, 34339], "20": ["vertical_and_slash", 30, 800, 2544], "21": ["vertical_and_slash", 30, 800, 1914], "22": ["vertical_and_slash", 30, 800, 83258], "23": ["vertical_and_slash", 30, 800, 7898], "24": 
["vertical_and_slash", 30, 800, 11609], "25": ["vertical_and_slash", 1000, 4096, 64138], "26": ["vertical_and_slash", 1000, 4096, 514471], "27": ["vertical_and_slash", 500, 700, 39930], "28": ["vertical_and_slash", 30, 800, 477456], "29": ["vertical_and_slash", 100, 800, 4526], "30": ["vertical_and_slash", 1000, 4096, 30006], "31": ["vertical_and_slash", 30, 800, 92845]}, {"0": ["vertical_and_slash", 30, 800, 55378], "1": ["vertical_and_slash", 1000, 4096, 17441], "2": ["vertical_and_slash", 100, 800, 1890658], "3": ["vertical_and_slash", 30, 800, 39922], "4": ["vertical_and_slash", 30, 800, 3841], "5": ["vertical_and_slash", 30, 800, 16402], "6": ["vertical_and_slash", 30, 800, 9274], "7": ["vertical_and_slash", 100, 800, 2756], "8": ["vertical_and_slash", 100, 800, 190896], "9": ["vertical_and_slash", 1000, 4096, 30060], "10": ["vertical_and_slash", 1000, 4096, 1123342], "11": ["vertical_and_slash", 1000, 4096, 260812], "12": ["vertical_and_slash", 1000, 4096, 4395769], "13": ["vertical_and_slash", 1000, 4096, 1803359], "14": ["vertical_and_slash", 30, 800, 17625], "15": ["vertical_and_slash", 1000, 4096, 1501177], "16": ["vertical_and_slash", 1000, 4096, 236955], "17": ["vertical_and_slash", 1000, 4096, 27239], "18": ["vertical_and_slash", 1000, 4096, 84045], "19": ["vertical_and_slash", 1000, 4096, 112395], "20": ["vertical_and_slash", 1000, 4096, 289351], "21": ["vertical_and_slash", 1000, 4096, 1200493], "22": ["vertical_and_slash", 100, 800, 5628], "23": ["vertical_and_slash", 1000, 4096, 53], "24": ["vertical_and_slash", 30, 800, 1001179], "25": ["vertical_and_slash", 1000, 4096, 1417294], "26": ["vertical_and_slash", 100, 800, 712290], "27": ["vertical_and_slash", 1000, 4096, 111462], "28": ["vertical_and_slash", 1000, 4096, 2382091], "29": ["vertical_and_slash", 30, 800, 10632], "30": ["vertical_and_slash", 100, 800, 404628], "31": ["vertical_and_slash", 1000, 4096, 36025]}, {"0": ["vertical_and_slash", 1000, 4096, 683931], "1": ["vertical_and_slash", 1000, 4096, 1978224], "2": ["vertical_and_slash", 30, 800, 529064], "3": ["vertical_and_slash", 30, 800, 20483], "4": ["vertical_and_slash", 30, 800, 226587], "5": ["vertical_and_slash", 30, 800, 100650], "6": ["vertical_and_slash", 30, 800, 88814], "7": ["vertical_and_slash", 30, 800, 25415], "8": ["vertical_and_slash", 1000, 4096, 126846], "9": ["vertical_and_slash", 100, 800, 83585], "10": ["vertical_and_slash", 1000, 4096, 53117], "11": ["vertical_and_slash", 1000, 4096, 30196], "12": ["vertical_and_slash", 1000, 4096, 81511], "13": ["vertical_and_slash", 1000, 4096, 25087], "14": ["vertical_and_slash", 1000, 4096, 52332], "15": ["vertical_and_slash", 1000, 4096, 1662596], "16": ["vertical_and_slash", 30, 800, 26199], "17": ["vertical_and_slash", 30, 800, 72420], "18": ["vertical_and_slash", 30, 800, 74770], "19": ["vertical_and_slash", 30, 800, 94064], "20": ["vertical_and_slash", 30, 800, 10369], "21": ["vertical_and_slash", 1000, 4096, 2802268], "22": ["vertical_and_slash", 30, 800, 32077], "23": ["vertical_and_slash", 500, 700, 751949], "24": ["vertical_and_slash", 100, 800, 23111], "25": ["vertical_and_slash", 100, 800, 13161], "26": ["vertical_and_slash", 100, 800, 164196], "27": ["vertical_and_slash", 1000, 4096, 12766], "28": ["vertical_and_slash", 1000, 4096, 37748], "29": ["vertical_and_slash", 1000, 4096, 394580], "30": ["vertical_and_slash", 30, 800, 1161581], "31": ["vertical_and_slash", 1000, 4096, 1070988]}, {"0": ["vertical_and_slash", 100, 800, 4619], "1": ["vertical_and_slash", 1000, 4096, 3223], "2": 
["vertical_and_slash", 100, 800, 65675], "3": ["vertical_and_slash", 30, 800, 56], "4": ["vertical_and_slash", 30, 800, 93], "5": ["vertical_and_slash", 30, 800, 72], "6": ["vertical_and_slash", 500, 700, 3523], "7": ["vertical_and_slash", 1000, 4096, 12230], "8": ["vertical_and_slash", 100, 800, 9301307], "9": ["vertical_and_slash", 1000, 4096, 418350], "10": ["vertical_and_slash", 1000, 4096, 994569], "11": ["vertical_and_slash", 100, 800, 399778], "12": ["vertical_and_slash", 1000, 4096, 2677334], "13": ["vertical_and_slash", 1000, 4096, 409432], "14": ["vertical_and_slash", 30, 800, 1233050], "15": ["vertical_and_slash", 1000, 4096, 5697704], "16": ["vertical_and_slash", 100, 800, 294], "17": ["vertical_and_slash", 30, 800, 50017], "18": ["vertical_and_slash", 30, 800, 1566], "19": ["vertical_and_slash", 30, 800, 2368], "20": ["vertical_and_slash", 30, 800, 3051012], "21": ["vertical_and_slash", 1000, 4096, 15983], "22": ["vertical_and_slash", 1000, 4096, 48], "23": ["vertical_and_slash", 1000, 4096, 312543], "24": ["vertical_and_slash", 30, 800, 4820], "25": ["vertical_and_slash", 30, 800, 100931], "26": ["vertical_and_slash", 30, 800, 69743], "27": ["vertical_and_slash", 30, 800, 22187], "28": ["vertical_and_slash", 30, 800, 3936], "29": ["vertical_and_slash", 30, 800, 4611], "30": ["vertical_and_slash", 30, 800, 21928], "31": ["vertical_and_slash", 30, 800, 133206]}, {"0": ["vertical_and_slash", 100, 800, 41811], "1": ["vertical_and_slash", 30, 800, 4226], "2": ["vertical_and_slash", 100, 800, 11930], "3": ["vertical_and_slash", 30, 800, 629146], "4": ["vertical_and_slash", 100, 800, 511736], "5": ["vertical_and_slash", 100, 800, 1408], "6": ["vertical_and_slash", 30, 800, 18012], "7": ["vertical_and_slash", 30, 800, 897], "8": ["vertical_and_slash", 30, 800, 107705], "9": ["vertical_and_slash", 30, 800, 152957], "10": ["vertical_and_slash", 30, 800, 272002], "11": ["vertical_and_slash", 30, 800, 5216722], "12": ["vertical_and_slash", 30, 800, 509504], "13": ["vertical_and_slash", 30, 800, 72091], "14": ["vertical_and_slash", 30, 800, 166293], "15": ["vertical_and_slash", 30, 800, 426344], "16": ["vertical_and_slash", 30, 800, 316624], "17": ["vertical_and_slash", 1000, 4096, 158902], "18": ["vertical_and_slash", 30, 800, 162502], "19": ["vertical_and_slash", 1000, 4096, 2464314], "20": ["vertical_and_slash", 1000, 4096, 5817909], "21": ["vertical_and_slash", 100, 800, 1141235], "22": ["vertical_and_slash", 30, 800, 452577], "23": ["vertical_and_slash", 30, 800, 193960], "24": ["vertical_and_slash", 30, 800, 538157], "25": ["vertical_and_slash", 30, 800, 1355759], "26": ["vertical_and_slash", 100, 800, 141236], "27": ["vertical_and_slash", 30, 800, 87608], "28": ["vertical_and_slash", 30, 800, 102946], "29": ["vertical_and_slash", 30, 800, 81254], "30": ["vertical_and_slash", 30, 800, 6194794], "31": ["vertical_and_slash", 30, 800, 2092660]}, {"0": ["vertical_and_slash", 30, 800, 278589], "1": ["vertical_and_slash", 30, 800, 1071731], "2": ["vertical_and_slash", 30, 800, 1991650], "3": ["vertical_and_slash", 30, 800, 308703], "4": ["vertical_and_slash", 30, 800, 1024242], "5": ["vertical_and_slash", 30, 800, 3107957], "6": ["vertical_and_slash", 30, 800, 926801], "7": ["vertical_and_slash", 30, 800, 2887199], "8": ["vertical_and_slash", 1000, 4096, 4152662], "9": ["vertical_and_slash", 100, 800, 15773492], "10": ["vertical_and_slash", 30, 800, 667496], "11": ["vertical_and_slash", 30, 800, 767325], "12": ["vertical_and_slash", 30, 800, 490706], "13": ["vertical_and_slash", 100, 800, 
3083166], "14": ["vertical_and_slash", 100, 800, 14433242], "15": ["vertical_and_slash", 30, 800, 514502], "16": ["vertical_and_slash", 1000, 4096, 4574900], "17": ["vertical_and_slash", 1000, 4096, 1828093], "18": ["vertical_and_slash", 30, 800, 3790483], "19": ["vertical_and_slash", 1000, 4096, 9164424], "20": ["vertical_and_slash", 1000, 4096, 1011346], "21": ["vertical_and_slash", 1000, 4096, 1768867], "22": ["vertical_and_slash", 100, 800, 3253894], "23": ["vertical_and_slash", 1000, 4096, 882663], "24": ["vertical_and_slash", 100, 800, 1974998], "25": ["vertical_and_slash", 500, 700, 1452483], "26": ["vertical_and_slash", 100, 800, 12992816], "27": ["vertical_and_slash", 1000, 4096, 4441511], "28": ["vertical_and_slash", 100, 800, 3146531], "29": ["vertical_and_slash", 1000, 4096, 7002295], "30": ["vertical_and_slash", 100, 800, 7974855], "31": ["vertical_and_slash", 1000, 4096, 2767293]}, {"0": ["vertical_and_slash", 30, 800, 517042], "1": ["vertical_and_slash", 30, 800, 9471250], "2": ["vertical_and_slash", 30, 800, 67128], "3": ["vertical_and_slash", 100, 800, 13225828], "4": ["vertical_and_slash", 1000, 4096, 8138531], "5": ["vertical_and_slash", 30, 800, 169424], "6": ["vertical_and_slash", 30, 800, 165102], "7": ["vertical_and_slash", 1000, 4096, 898000], "8": ["vertical_and_slash", 100, 800, 498306], "9": ["vertical_and_slash", 100, 800, 12016777], "10": ["vertical_and_slash", 100, 800, 13078398], "11": ["vertical_and_slash", 1000, 4096, 569449], "12": ["vertical_and_slash", 1000, 4096, 4419468], "13": ["vertical_and_slash", 100, 800, 2308923], "14": ["vertical_and_slash", 100, 800, 188999], "15": ["vertical_and_slash", 30, 800, 685736], "16": ["vertical_and_slash", 100, 800, 161819], "17": ["vertical_and_slash", 100, 800, 1878966], "18": ["vertical_and_slash", 100, 800, 7840855], "19": ["vertical_and_slash", 30, 800, 207320], "20": ["vertical_and_slash", 100, 800, 2233365], "21": ["vertical_and_slash", 100, 800, 685239], "22": ["vertical_and_slash", 1000, 4096, 1493618], "23": ["vertical_and_slash", 30, 800, 1137958], "24": ["vertical_and_slash", 30, 800, 115113], "25": ["vertical_and_slash", 30, 800, 809754], "26": ["vertical_and_slash", 30, 800, 1328591], "27": ["vertical_and_slash", 30, 800, 697970], "28": ["vertical_and_slash", 1000, 4096, 14409], "29": ["vertical_and_slash", 30, 800, 376399], "30": ["vertical_and_slash", 30, 800, 71599], "31": ["vertical_and_slash", 30, 800, 431162]}, {"0": ["vertical_and_slash", 30, 800, 7073664], "1": ["vertical_and_slash", 100, 800, 4139486], "2": ["vertical_and_slash", 30, 800, 126298], "3": ["vertical_and_slash", 30, 800, 626891], "4": ["vertical_and_slash", 1000, 4096, 244457], "5": ["vertical_and_slash", 30, 800, 338124], "6": ["vertical_and_slash", 100, 800, 4247346], "7": ["vertical_and_slash", 100, 800, 1853876], "8": ["vertical_and_slash", 1000, 4096, 6355420], "9": ["vertical_and_slash", 100, 800, 988264], "10": ["vertical_and_slash", 1000, 4096, 984583], "11": ["vertical_and_slash", 100, 800, 914211], "12": ["vertical_and_slash", 1000, 4096, 570502], "13": ["vertical_and_slash", 1000, 4096, 10187572], "14": ["vertical_and_slash", 1000, 4096, 3408578], "15": ["vertical_and_slash", 1000, 4096, 11375984], "16": ["vertical_and_slash", 100, 800, 5144098], "17": ["vertical_and_slash", 1000, 4096, 350031], "18": ["vertical_and_slash", 1000, 4096, 1299268], "19": ["vertical_and_slash", 1000, 4096, 790117], "20": ["vertical_and_slash", 100, 800, 24094], "21": ["vertical_and_slash", 30, 800, 3856442], "22": ["vertical_and_slash", 100, 
800, 383726], "23": ["vertical_and_slash", 500, 700, 832], "24": ["vertical_and_slash", 100, 800, 7717427], "25": ["vertical_and_slash", 1000, 4096, 4545251], "26": ["vertical_and_slash", 30, 800, 7922478], "27": ["vertical_and_slash", 1000, 4096, 2809849], "28": ["vertical_and_slash", 1000, 4096, 4392930], "29": ["vertical_and_slash", 100, 800, 2998060], "30": ["vertical_and_slash", 100, 800, 6173903], "31": ["vertical_and_slash", 1000, 4096, 2536227]}, {"0": ["vertical_and_slash", 30, 800, 1733117], "1": ["vertical_and_slash", 100, 800, 2514524], "2": ["vertical_and_slash", 1000, 4096, 12567570], "3": ["vertical_and_slash", 1000, 4096, 2817534], "4": ["vertical_and_slash", 1000, 4096, 10571712], "5": ["vertical_and_slash", 100, 800, 1311331], "6": ["vertical_and_slash", 30, 800, 4202358], "7": ["vertical_and_slash", 30, 800, 4970102], "8": ["vertical_and_slash", 30, 800, 88687], "9": ["vertical_and_slash", 30, 800, 293880], "10": ["vertical_and_slash", 500, 700, 70693], "11": ["vertical_and_slash", 30, 800, 13849], "12": ["vertical_and_slash", 30, 800, 238706], "13": ["vertical_and_slash", 30, 800, 78435], "14": ["vertical_and_slash", 30, 800, 164251], "15": ["vertical_and_slash", 30, 800, 199789], "16": ["vertical_and_slash", 30, 800, 200684], "17": ["vertical_and_slash", 1000, 4096, 1761919], "18": ["vertical_and_slash", 30, 800, 210071], "19": ["vertical_and_slash", 30, 800, 68554], "20": ["vertical_and_slash", 30, 800, 484345], "21": ["vertical_and_slash", 30, 800, 1489873], "22": ["vertical_and_slash", 30, 800, 301028], "23": ["vertical_and_slash", 30, 800, 1124431], "24": ["vertical_and_slash", 100, 800, 636179], "25": ["vertical_and_slash", 100, 800, 611008], "26": ["vertical_and_slash", 1000, 4096, 1639], "27": ["vertical_and_slash", 1000, 4096, 8255730], "28": ["vertical_and_slash", 1000, 4096, 6678469], "29": ["vertical_and_slash", 1000, 4096, 628985], "30": ["vertical_and_slash", 1000, 4096, 348316], "31": ["vertical_and_slash", 1000, 4096, 2159698]}, {"0": ["vertical_and_slash", 100, 800, 7105558], "1": ["vertical_and_slash", 30, 800, 1085603], "2": ["vertical_and_slash", 1000, 4096, 7896209], "3": ["vertical_and_slash", 30, 800, 193488], "4": ["vertical_and_slash", 100, 800, 1467223], "5": ["vertical_and_slash", 30, 800, 13794329], "6": ["vertical_and_slash", 1000, 4096, 15661583], "7": ["vertical_and_slash", 1000, 4096, 21334871], "8": ["vertical_and_slash", 1000, 4096, 6158120], "9": ["vertical_and_slash", 1000, 4096, 7414022], "10": ["vertical_and_slash", 100, 800, 14091447], "11": ["vertical_and_slash", 1000, 4096, 15589771], "12": ["vertical_and_slash", 1000, 4096, 14632639], "13": ["vertical_and_slash", 100, 800, 1695539], "14": ["vertical_and_slash", 30, 800, 2605978], "15": ["vertical_and_slash", 1000, 4096, 12495330], "16": ["vertical_and_slash", 1000, 4096, 14564586], "17": ["vertical_and_slash", 500, 700, 962969], "18": ["vertical_and_slash", 1000, 4096, 12281016], "19": ["vertical_and_slash", 1000, 4096, 4614742], "20": ["vertical_and_slash", 100, 800, 11940535], "21": ["vertical_and_slash", 100, 800, 2445981], "22": ["vertical_and_slash", 100, 800, 2485005], "23": ["vertical_and_slash", 1000, 4096, 6864324], "24": ["vertical_and_slash", 1000, 4096, 16230551], "25": ["vertical_and_slash", 100, 800, 9358656], "26": ["vertical_and_slash", 100, 800, 14973598], "27": ["vertical_and_slash", 1000, 4096, 14250781], "28": ["vertical_and_slash", 1000, 4096, 18030248], "29": ["vertical_and_slash", 1000, 4096, 20247786], "30": ["vertical_and_slash", 1000, 4096, 12736495], 
"31": ["vertical_and_slash", 100, 800, 9012943]}, {"0": ["vertical_and_slash", 100, 800, 4792757], "1": ["vertical_and_slash", 100, 800, 5568805], "2": ["vertical_and_slash", 1000, 4096, 12086343], "3": ["vertical_and_slash", 100, 800, 7359182], "4": ["vertical_and_slash", 100, 800, 13719718], "5": ["vertical_and_slash", 1000, 4096, 17051068], "6": ["vertical_and_slash", 100, 800, 15947388], "7": ["vertical_and_slash", 1000, 4096, 9143327], "8": ["vertical_and_slash", 1000, 4096, 21263361], "9": ["vertical_and_slash", 1000, 4096, 17189141], "10": ["vertical_and_slash", 1000, 4096, 7802422], "11": ["vertical_and_slash", 1000, 4096, 18488560], "12": ["vertical_and_slash", 100, 800, 14938800], "13": ["vertical_and_slash", 100, 800, 11012944], "14": ["vertical_and_slash", 1000, 4096, 19104830], "15": ["vertical_and_slash", 3500, 100, 32379], "16": ["vertical_and_slash", 100, 800, 3067742], "17": ["vertical_and_slash", 100, 800, 1977488], "18": ["vertical_and_slash", 1000, 4096, 15351109], "19": ["vertical_and_slash", 30, 800, 1627281], "20": ["vertical_and_slash", 30, 800, 1280991], "21": ["vertical_and_slash", 100, 800, 12133497], "22": ["vertical_and_slash", 1000, 4096, 17870425], "23": ["vertical_and_slash", 30, 800, 4040253], "24": ["vertical_and_slash", 1000, 4096, 6272625], "25": ["vertical_and_slash", 100, 800, 1225145], "26": ["vertical_and_slash", 100, 800, 2746332], "27": ["vertical_and_slash", 100, 800, 4525182], "28": ["vertical_and_slash", 100, 800, 6274770], "29": ["vertical_and_slash", 100, 800, 6919161], "30": ["vertical_and_slash", 100, 800, 3456148], "31": ["vertical_and_slash", 100, 800, 23867]}, {"0": ["vertical_and_slash", 1000, 4096, 7275761], "1": ["vertical_and_slash", 100, 800, 5068315], "2": ["vertical_and_slash", 100, 800, 11162394], "3": ["vertical_and_slash", 100, 800, 3672939], "4": ["vertical_and_slash", 3500, 100, 20894613], "5": ["vertical_and_slash", 1000, 4096, 7938372], "6": ["vertical_and_slash", 100, 800, 12544912], "7": ["vertical_and_slash", 100, 800, 2008695], "8": ["vertical_and_slash", 1000, 4096, 3368310], "9": ["vertical_and_slash", 30, 800, 1508993], "10": ["vertical_and_slash", 1000, 4096, 3495386], "11": ["vertical_and_slash", 3500, 100, 16438193], "12": ["vertical_and_slash", 100, 800, 7069375], "13": ["vertical_and_slash", 100, 800, 10686684], "14": ["vertical_and_slash", 30, 800, 501489], "15": ["vertical_and_slash", 100, 800, 6067001], "16": ["vertical_and_slash", 100, 800, 6935788], "17": ["vertical_and_slash", 1000, 4096, 3300792], "18": ["vertical_and_slash", 100, 800, 7398154], "19": ["vertical_and_slash", 100, 800, 5788636], "20": ["vertical_and_slash", 100, 800, 4456802], "21": ["vertical_and_slash", 100, 800, 2680176], "22": ["vertical_and_slash", 100, 800, 5544567], "23": ["vertical_and_slash", 1000, 4096, 13475356], "24": ["vertical_and_slash", 1000, 4096, 4901727], "25": ["vertical_and_slash", 1000, 4096, 3768996], "26": ["vertical_and_slash", 1000, 4096, 5368869], "27": ["vertical_and_slash", 3500, 100, 14218181], "28": ["vertical_and_slash", 1000, 4096, 13003444], "29": ["vertical_and_slash", 1000, 4096, 5716382], "30": ["vertical_and_slash", 3500, 100, 19916116], "31": ["vertical_and_slash", 1000, 4096, 11776798]}, {"0": ["vertical_and_slash", 100, 800, 13001986], "1": ["vertical_and_slash", 1000, 4096, 7570569], "2": ["vertical_and_slash", 100, 800, 951160], "3": ["vertical_and_slash", 100, 800, 11933179], "4": ["vertical_and_slash", 30, 800, 5365811], "5": ["vertical_and_slash", 100, 800, 10272574], "6": ["vertical_and_slash", 
1000, 4096, 6527670], "7": ["vertical_and_slash", 100, 800, 12930014], "8": ["vertical_and_slash", 100, 800, 359537], "9": ["vertical_and_slash", 100, 800, 10654966], "10": ["vertical_and_slash", 100, 800, 1330316], "11": ["vertical_and_slash", 100, 800, 9971156], "12": ["vertical_and_slash", 1000, 4096, 5781478], "13": ["vertical_and_slash", 100, 800, 6032127], "14": ["vertical_and_slash", 100, 800, 1418329], "15": ["vertical_and_slash", 100, 800, 13069922], "16": ["vertical_and_slash", 100, 800, 8547563], "17": ["vertical_and_slash", 100, 800, 970921], "18": ["vertical_and_slash", 1000, 4096, 9256328], "19": ["vertical_and_slash", 1000, 4096, 12447206], "20": ["vertical_and_slash", 100, 800, 153856], "21": ["vertical_and_slash", 100, 800, 8022371], "22": ["vertical_and_slash", 3500, 100, 18626483], "23": ["vertical_and_slash", 100, 800, 3180643], "24": ["vertical_and_slash", 30, 800, 3549186], "25": ["vertical_and_slash", 100, 800, 2600992], "26": ["vertical_and_slash", 3500, 100, 21080570], "27": ["vertical_and_slash", 1000, 4096, 2995096], "28": ["vertical_and_slash", 30, 800, 13324952], "29": ["vertical_and_slash", 100, 800, 7015426], "30": ["vertical_and_slash", 100, 800, 17142326], "31": ["vertical_and_slash", 30, 800, 2059831]}, {"0": ["vertical_and_slash", 100, 800, 336984], "1": ["vertical_and_slash", 1000, 4096, 11908787], "2": ["vertical_and_slash", 1000, 4096, 11465673], "3": ["vertical_and_slash", 1000, 4096, 3870378], "4": ["vertical_and_slash", 1000, 4096, 1000373], "5": ["vertical_and_slash", 1000, 4096, 6450804], "6": ["vertical_and_slash", 1000, 4096, 6602987], "7": ["vertical_and_slash", 1000, 4096, 6552477], "8": ["vertical_and_slash", 30, 800, 8671938], "9": ["vertical_and_slash", 100, 800, 3906764], "10": ["vertical_and_slash", 1000, 4096, 7300294], "11": ["vertical_and_slash", 100, 800, 9068418], "12": ["vertical_and_slash", 100, 800, 5573415], "13": ["vertical_and_slash", 100, 800, 4302354], "14": ["vertical_and_slash", 30, 800, 969401], "15": ["vertical_and_slash", 100, 800, 132492], "16": ["vertical_and_slash", 1000, 4096, 10575265], "17": ["vertical_and_slash", 30, 800, 114557], "18": ["vertical_and_slash", 1000, 4096, 1669778], "19": ["vertical_and_slash", 30, 800, 244697], "20": ["vertical_and_slash", 30, 800, 401989], "21": ["vertical_and_slash", 1000, 4096, 257876], "22": ["vertical_and_slash", 100, 800, 1656276], "23": ["vertical_and_slash", 100, 800, 6627755], "24": ["vertical_and_slash", 100, 800, 17069094], "25": ["vertical_and_slash", 1000, 4096, 17310922], "26": ["vertical_and_slash", 3500, 100, 19238326], "27": ["vertical_and_slash", 100, 800, 10416201], "28": ["vertical_and_slash", 1000, 4096, 9125015], "29": ["vertical_and_slash", 100, 800, 17113558], "30": ["vertical_and_slash", 1000, 4096, 12041930], "31": ["vertical_and_slash", 1000, 4096, 6060396]}, {"0": ["vertical_and_slash", 1000, 4096, 9259982], "1": ["vertical_and_slash", 1000, 4096, 8618567], "2": ["vertical_and_slash", 100, 800, 3876940], "3": ["vertical_and_slash", 1000, 4096, 12767960], "4": ["vertical_and_slash", 1000, 4096, 6112941], "5": ["vertical_and_slash", 1000, 4096, 9851048], "6": ["vertical_and_slash", 1000, 4096, 5763271], "7": ["vertical_and_slash", 1000, 4096, 12744434], "8": ["vertical_and_slash", 100, 800, 12512293], "9": ["vertical_and_slash", 1000, 4096, 2367543], "10": ["vertical_and_slash", 100, 800, 12342103], "11": ["vertical_and_slash", 100, 800, 3126675], "12": ["vertical_and_slash", 1000, 4096, 13617286], "13": ["vertical_and_slash", 1000, 4096, 8094518], "14": 
["vertical_and_slash", 1000, 4096, 851614], "15": ["vertical_and_slash", 1000, 4096, 10519480], "16": ["vertical_and_slash", 100, 800, 1706372], "17": ["vertical_and_slash", 100, 800, 248757], "18": ["vertical_and_slash", 100, 800, 4394336], "19": ["vertical_and_slash", 100, 800, 1886529], "20": ["vertical_and_slash", 1000, 4096, 6486541], "21": ["vertical_and_slash", 100, 800, 1175436], "22": ["vertical_and_slash", 100, 800, 7864652], "23": ["vertical_and_slash", 100, 800, 1001917], "24": ["vertical_and_slash", 100, 800, 2494293], "25": ["vertical_and_slash", 1000, 4096, 7698995], "26": ["vertical_and_slash", 100, 800, 2946712], "27": ["vertical_and_slash", 100, 800, 5464103], "28": ["vertical_and_slash", 100, 800, 2608538], "29": ["vertical_and_slash", 100, 800, 1606308], "30": ["vertical_and_slash", 1000, 4096, 5981702], "31": ["vertical_and_slash", 3500, 100, 18590832]}, {"0": ["vertical_and_slash", 100, 800, 4688244], "1": ["vertical_and_slash", 100, 800, 11368272], "2": ["vertical_and_slash", 100, 800, 2558719], "3": ["vertical_and_slash", 1000, 4096, 9536926], "4": ["vertical_and_slash", 1000, 4096, 12315283], "5": ["vertical_and_slash", 1000, 4096, 6272119], "6": ["vertical_and_slash", 1000, 4096, 4450200], "7": ["vertical_and_slash", 100, 800, 5822568], "8": ["vertical_and_slash", 1000, 4096, 13523232], "9": ["vertical_and_slash", 100, 800, 816607], "10": ["vertical_and_slash", 1000, 4096, 15825338], "11": ["vertical_and_slash", 100, 800, 1133867], "12": ["vertical_and_slash", 100, 800, 10722989], "13": ["vertical_and_slash", 100, 800, 2466001], "14": ["vertical_and_slash", 100, 800, 16732584], "15": ["vertical_and_slash", 100, 800, 1052553], "16": ["vertical_and_slash", 100, 800, 8602649], "17": ["vertical_and_slash", 100, 800, 8851217], "18": ["vertical_and_slash", 100, 800, 6104130], "19": ["vertical_and_slash", 1000, 4096, 18459502], "20": ["vertical_and_slash", 100, 800, 8076967], "21": ["vertical_and_slash", 1000, 4096, 4863209], "22": ["vertical_and_slash", 1000, 4096, 8892415], "23": ["vertical_and_slash", 1000, 4096, 9542798], "24": ["vertical_and_slash", 100, 800, 1384183], "25": ["vertical_and_slash", 100, 800, 4035455], "26": ["vertical_and_slash", 100, 800, 536763], "27": ["vertical_and_slash", 1000, 4096, 2058585], "28": ["vertical_and_slash", 100, 800, 4195607], "29": ["vertical_and_slash", 100, 800, 2407136], "30": ["vertical_and_slash", 100, 800, 2106926], "31": ["vertical_and_slash", 100, 800, 3807607]}, {"0": ["vertical_and_slash", 100, 800, 15975096], "1": ["vertical_and_slash", 3500, 100, 20664973], "2": ["vertical_and_slash", 1000, 4096, 943914], "3": ["vertical_and_slash", 100, 800, 14363276], "4": ["vertical_and_slash", 100, 800, 720326], "5": ["vertical_and_slash", 1000, 4096, 7725879], "6": ["vertical_and_slash", 1000, 4096, 11411255], "7": ["vertical_and_slash", 1000, 4096, 9492657], "8": ["vertical_and_slash", 1000, 4096, 16448227], "9": ["vertical_and_slash", 100, 800, 6180918], "10": ["vertical_and_slash", 1000, 4096, 10942342], "11": ["vertical_and_slash", 1000, 4096, 12047657], "12": ["vertical_and_slash", 100, 800, 2376658], "13": ["vertical_and_slash", 1000, 4096, 17780083], "14": ["vertical_and_slash", 1000, 4096, 8548356], "15": ["vertical_and_slash", 100, 800, 4545880], "16": ["vertical_and_slash", 30, 800, 2020350], "17": ["vertical_and_slash", 100, 800, 15875867], "18": ["vertical_and_slash", 30, 800, 661201], "19": ["vertical_and_slash", 1000, 4096, 14915782], "20": ["vertical_and_slash", 100, 800, 4106388], "21": ["vertical_and_slash", 30, 
800, 14163451], "22": ["vertical_and_slash", 100, 800, 1759639], "23": ["vertical_and_slash", 1000, 4096, 2391070], "24": ["vertical_and_slash", 100, 800, 10749758], "25": ["vertical_and_slash", 100, 800, 8022438], "26": ["vertical_and_slash", 100, 800, 1013941], "27": ["vertical_and_slash", 100, 800, 3537516], "28": ["vertical_and_slash", 100, 800, 1252545], "29": ["vertical_and_slash", 100, 800, 1155740], "30": ["vertical_and_slash", 1000, 4096, 2590667], "31": ["vertical_and_slash", 100, 800, 3320946]}, {"0": ["vertical_and_slash", 1000, 4096, 8025205], "1": ["vertical_and_slash", 500, 700, 2286667], "2": ["vertical_and_slash", 1000, 4096, 2104863], "3": ["vertical_and_slash", 1000, 4096, 2160060], "4": ["vertical_and_slash", 1000, 4096, 4209178], "5": ["vertical_and_slash", 1000, 4096, 5703899], "6": ["vertical_and_slash", 100, 800, 15566139], "7": ["vertical_and_slash", 500, 700, 464012], "8": ["vertical_and_slash", 1000, 4096, 632556], "9": ["vertical_and_slash", 1000, 4096, 10933130], "10": ["vertical_and_slash", 3500, 100, 6376023], "11": ["vertical_and_slash", 30, 800, 53293], "12": ["vertical_and_slash", 3500, 100, 9195722], "13": ["vertical_and_slash", 100, 800, 130891], "14": ["vertical_and_slash", 100, 800, 1266310], "15": ["vertical_and_slash", 100, 800, 12042893], "16": ["vertical_and_slash", 100, 800, 1440252], "17": ["vertical_and_slash", 100, 800, 5003178], "18": ["vertical_and_slash", 100, 800, 9451180], "19": ["vertical_and_slash", 100, 800, 16518635], "20": ["vertical_and_slash", 1000, 4096, 16574448], "21": ["vertical_and_slash", 100, 800, 10001073], "22": ["vertical_and_slash", 100, 800, 6194150], "23": ["vertical_and_slash", 100, 800, 1990080], "24": ["vertical_and_slash", 100, 800, 14105574], "25": ["vertical_and_slash", 3500, 100, 49578], "26": ["vertical_and_slash", 100, 800, 1368613], "27": ["vertical_and_slash", 100, 800, 882483], "28": ["vertical_and_slash", 100, 800, 200592], "29": ["vertical_and_slash", 100, 800, 4144857], "30": ["vertical_and_slash", 30, 800, 2059620], "31": ["vertical_and_slash", 1000, 4096, 7650136]}, {"0": ["vertical_and_slash", 3500, 100, 20200147], "1": ["vertical_and_slash", 100, 800, 18033672], "2": ["vertical_and_slash", 100, 800, 19227421], "3": ["vertical_and_slash", 1000, 4096, 7658465], "4": ["vertical_and_slash", 100, 800, 4862174], "5": ["vertical_and_slash", 100, 800, 6197824], "6": ["vertical_and_slash", 100, 800, 5687873], "7": ["vertical_and_slash", 100, 800, 13005015], "8": ["vertical_and_slash", 1000, 4096, 6677727], "9": ["vertical_and_slash", 500, 700, 1282697], "10": ["vertical_and_slash", 30, 800, 3148411], "11": ["vertical_and_slash", 500, 700, 8985965], "12": ["vertical_and_slash", 100, 800, 11107850], "13": ["vertical_and_slash", 30, 800, 2077544], "14": ["vertical_and_slash", 1000, 4096, 10030857], "15": ["vertical_and_slash", 100, 800, 1625067], "16": ["vertical_and_slash", 100, 800, 332660], "17": ["vertical_and_slash", 3500, 100, 17539067], "18": ["vertical_and_slash", 500, 700, 97483], "19": ["vertical_and_slash", 30, 800, 10910089], "20": ["vertical_and_slash", 500, 700, 49927], "21": ["vertical_and_slash", 1000, 4096, 2959963], "22": ["vertical_and_slash", 1000, 4096, 1232910], "23": ["vertical_and_slash", 100, 800, 482216], "24": ["vertical_and_slash", 3500, 100, 2789809], "25": ["vertical_and_slash", 3500, 100, 1787013], "26": ["vertical_and_slash", 100, 800, 6121965], "27": ["vertical_and_slash", 100, 800, 10417031], "28": ["vertical_and_slash", 100, 800, 476098], "29": ["vertical_and_slash", 3500, 100, 
13019985], "30": ["vertical_and_slash", 100, 800, 15057321], "31": ["vertical_and_slash", 100, 800, 7206530]}, {"0": ["vertical_and_slash", 30, 800, 3863946], "1": ["vertical_and_slash", 3500, 100, 373838], "2": ["vertical_and_slash", 30, 800, 2498107], "3": ["vertical_and_slash", 30, 800, 1774834], "4": ["vertical_and_slash", 30, 800, 13518574], "5": ["vertical_and_slash", 30, 800, 17864279], "6": ["vertical_and_slash", 30, 800, 4971247], "7": ["vertical_and_slash", 30, 800, 15064092], "8": ["vertical_and_slash", 1000, 4096, 173702], "9": ["vertical_and_slash", 100, 800, 2079528], "10": ["vertical_and_slash", 1000, 4096, 1395995], "11": ["vertical_and_slash", 100, 800, 16807189], "12": ["vertical_and_slash", 1000, 4096, 3387818], "13": ["vertical_and_slash", 1000, 4096, 215373], "14": ["vertical_and_slash", 1000, 4096, 7656048], "15": ["vertical_and_slash", 1000, 4096, 3284167], "16": ["vertical_and_slash", 100, 800, 208560], "17": ["vertical_and_slash", 100, 800, 12910224], "18": ["vertical_and_slash", 100, 800, 2482406], "19": ["vertical_and_slash", 100, 800, 591300], "20": ["vertical_and_slash", 500, 700, 2512230], "21": ["vertical_and_slash", 100, 800, 650819], "22": ["vertical_and_slash", 100, 800, 750172], "23": ["vertical_and_slash", 100, 800, 98380], "24": ["vertical_and_slash", 1000, 4096, 12591674], "25": ["vertical_and_slash", 100, 800, 7520129], "26": ["vertical_and_slash", 3500, 100, 19780031], "27": ["vertical_and_slash", 1000, 4096, 11324806], "28": ["vertical_and_slash", 100, 800, 2339301], "29": ["vertical_and_slash", 3500, 100, 20537162], "30": ["vertical_and_slash", 100, 800, 1802458], "31": ["vertical_and_slash", 1000, 4096, 4121953]}, {"0": ["vertical_and_slash", 100, 800, 1406058], "1": ["vertical_and_slash", 30, 800, 20495], "2": ["vertical_and_slash", 100, 800, 265247], "3": ["vertical_and_slash", 30, 800, 6044172], "4": ["vertical_and_slash", 100, 800, 15417162], "5": ["vertical_and_slash", 100, 800, 20101], "6": ["vertical_and_slash", 30, 800, 12443], "7": ["vertical_and_slash", 100, 800, 1029], "8": ["vertical_and_slash", 30, 800, 49334], "9": ["vertical_and_slash", 30, 800, 30976], "10": ["vertical_and_slash", 30, 800, 127540], "11": ["vertical_and_slash", 30, 800, 3597689], "12": ["vertical_and_slash", 30, 800, 32317], "13": ["vertical_and_slash", 30, 800, 202557], "14": ["vertical_and_slash", 30, 800, 531805], "15": ["vertical_and_slash", 30, 800, 606518], "16": ["vertical_and_slash", 30, 800, 1152706], "17": ["vertical_and_slash", 1000, 4096, 5604379], "18": ["vertical_and_slash", 30, 800, 663403], "19": ["vertical_and_slash", 1000, 4096, 11655952], "20": ["vertical_and_slash", 100, 800, 15102172], "21": ["vertical_and_slash", 100, 800, 4674143], "22": ["vertical_and_slash", 500, 700, 1539328], "23": ["vertical_and_slash", 100, 800, 3051857], "24": ["vertical_and_slash", 30, 800, 123576], "25": ["vertical_and_slash", 100, 800, 964667], "26": ["vertical_and_slash", 30, 800, 41505], "27": ["vertical_and_slash", 30, 800, 59560], "28": ["vertical_and_slash", 100, 800, 17208], "29": ["vertical_and_slash", 30, 800, 82626], "30": ["vertical_and_slash", 30, 800, 1815531], "31": ["vertical_and_slash", 100, 800, 2897668]}, {"0": ["vertical_and_slash", 30, 800, 48323], "1": ["vertical_and_slash", 30, 800, 689675], "2": ["vertical_and_slash", 30, 800, 542041], "3": ["vertical_and_slash", 30, 800, 8544], "4": ["vertical_and_slash", 30, 800, 102588], "5": ["vertical_and_slash", 100, 800, 2064154], "6": ["vertical_and_slash", 30, 800, 845227], "7": ["vertical_and_slash", 
30, 800, 2922720], "8": ["vertical_and_slash", 1000, 4096, 2932415], "9": ["vertical_and_slash", 1000, 4096, 3062180], "10": ["vertical_and_slash", 100, 800, 485119], "11": ["vertical_and_slash", 30, 800, 215049], "12": ["vertical_and_slash", 100, 800, 387511], "13": ["vertical_and_slash", 100, 800, 1447813], "14": ["vertical_and_slash", 1000, 4096, 3878389], "15": ["vertical_and_slash", 100, 800, 376333], "16": ["vertical_and_slash", 3500, 100, 13506969], "17": ["vertical_and_slash", 100, 800, 12850708], "18": ["vertical_and_slash", 30, 800, 372529], "19": ["vertical_and_slash", 1000, 4096, 3746168], "20": ["vertical_and_slash", 100, 800, 170359], "21": ["vertical_and_slash", 100, 800, 1130785], "22": ["vertical_and_slash", 100, 800, 116224], "23": ["vertical_and_slash", 100, 800, 1001182], "24": ["vertical_and_slash", 100, 800, 335681], "25": ["vertical_and_slash", 100, 800, 3392285], "26": ["vertical_and_slash", 1000, 4096, 4420760], "27": ["vertical_and_slash", 3500, 100, 12258981], "28": ["vertical_and_slash", 500, 700, 1941188], "29": ["vertical_and_slash", 1000, 4096, 7639240], "30": ["vertical_and_slash", 500, 700, 8277346], "31": ["vertical_and_slash", 3500, 100, 3442659]}, {"0": ["vertical_and_slash", 30, 800, 945264], "1": ["vertical_and_slash", 1000, 4096, 3474994], "2": ["vertical_and_slash", 500, 700, 218918], "3": ["vertical_and_slash", 3500, 100, 20221076], "4": ["vertical_and_slash", 3500, 100, 21680113], "5": ["vertical_and_slash", 30, 800, 94866], "6": ["vertical_and_slash", 30, 800, 190907], "7": ["vertical_and_slash", 1000, 4096, 1708889], "8": ["vertical_and_slash", 100, 800, 2832752], "9": ["vertical_and_slash", 1000, 4096, 613061], "10": ["vertical_and_slash", 1000, 4096, 7381575], "11": ["vertical_and_slash", 1000, 4096, 1462120], "12": ["vertical_and_slash", 1000, 4096, 3338671], "13": ["vertical_and_slash", 100, 800, 1664528], "14": ["vertical_and_slash", 500, 700, 143074], "15": ["vertical_and_slash", 30, 800, 433035], "16": ["vertical_and_slash", 500, 700, 210886], "17": ["vertical_and_slash", 100, 800, 8632139], "18": ["vertical_and_slash", 100, 800, 17521811], "19": ["vertical_and_slash", 30, 800, 194306], "20": ["vertical_and_slash", 100, 800, 3156950], "21": ["vertical_and_slash", 100, 800, 2413125], "22": ["vertical_and_slash", 1000, 4096, 10110205], "23": ["vertical_and_slash", 100, 800, 695569], "24": ["vertical_and_slash", 30, 800, 32256], "25": ["vertical_and_slash", 30, 800, 396762], "26": ["vertical_and_slash", 30, 800, 726815], "27": ["vertical_and_slash", 30, 800, 499056], "28": ["vertical_and_slash", 30, 800, 24234], "29": ["vertical_and_slash", 30, 800, 87299], "30": ["vertical_and_slash", 30, 800, 82758], "31": ["vertical_and_slash", 30, 800, 447266]}, {"0": ["vertical_and_slash", 100, 800, 13520320], "1": ["vertical_and_slash", 100, 800, 1746572], "2": ["vertical_and_slash", 100, 800, 81358], "3": ["vertical_and_slash", 100, 800, 53915], "4": ["vertical_and_slash", 100, 800, 16824352], "5": ["vertical_and_slash", 100, 800, 124419], "6": ["vertical_and_slash", 100, 800, 5336412], "7": ["vertical_and_slash", 100, 800, 1005227], "8": ["vertical_and_slash", 1000, 4096, 17919472], "9": ["vertical_and_slash", 100, 800, 5089389], "10": ["vertical_and_slash", 1000, 4096, 2318753], "11": ["vertical_and_slash", 100, 800, 2351529], "12": ["vertical_and_slash", 1000, 4096, 1068220], "13": ["vertical_and_slash", 1000, 4096, 18765314], "14": ["vertical_and_slash", 1000, 4096, 11512280], "15": ["vertical_and_slash", 1000, 4096, 14722530], "16": 
["vertical_and_slash", 100, 800, 1542041], "17": ["vertical_and_slash", 3500, 100, 19279869], "18": ["vertical_and_slash", 100, 800, 4711439], "19": ["vertical_and_slash", 3500, 100, 3688560], "20": ["vertical_and_slash", 3500, 100, 224250], "21": ["vertical_and_slash", 100, 800, 10537230], "22": ["vertical_and_slash", 100, 800, 749819], "23": ["vertical_and_slash", 100, 800, 25187], "24": ["vertical_and_slash", 100, 800, 13068183], "25": ["vertical_and_slash", 100, 800, 17508351], "26": ["vertical_and_slash", 100, 800, 12981109], "27": ["vertical_and_slash", 100, 800, 15314279], "28": ["vertical_and_slash", 100, 800, 15558838], "29": ["vertical_and_slash", 100, 800, 3774507], "30": ["vertical_and_slash", 100, 800, 6486179], "31": ["vertical_and_slash", 100, 800, 15420283]}, {"0": ["vertical_and_slash", 100, 800, 1793383], "1": ["vertical_and_slash", 100, 800, 8103093], "2": ["vertical_and_slash", 1000, 4096, 12596743], "3": ["vertical_and_slash", 1000, 4096, 5012316], "4": ["vertical_and_slash", 1000, 4096, 12870742], "5": ["vertical_and_slash", 100, 800, 3459141], "6": ["vertical_and_slash", 30, 800, 10224901], "7": ["vertical_and_slash", 100, 800, 3753981], "8": ["vertical_and_slash", 30, 800, 140040], "9": ["vertical_and_slash", 30, 800, 550671], "10": ["vertical_and_slash", 100, 800, 94454], "11": ["vertical_and_slash", 30, 800, 8909], "12": ["vertical_and_slash", 30, 800, 152077], "13": ["vertical_and_slash", 30, 800, 49171], "14": ["vertical_and_slash", 30, 800, 107813], "15": ["vertical_and_slash", 30, 800, 128764], "16": ["vertical_and_slash", 30, 800, 617322], "17": ["vertical_and_slash", 1000, 4096, 6019612], "18": ["vertical_and_slash", 100, 800, 766582], "19": ["vertical_and_slash", 30, 800, 52503], "20": ["vertical_and_slash", 30, 800, 300294], "21": ["vertical_and_slash", 30, 800, 1577098], "22": ["vertical_and_slash", 100, 800, 838126], "23": ["vertical_and_slash", 100, 800, 1218912], "24": ["vertical_and_slash", 100, 800, 1720664], "25": ["vertical_and_slash", 100, 800, 1377743], "26": ["vertical_and_slash", 1000, 4096, 900287], "27": ["vertical_and_slash", 1000, 4096, 12066126], "28": ["vertical_and_slash", 1000, 4096, 14264762], "29": ["vertical_and_slash", 1000, 4096, 71284], "30": ["vertical_and_slash", 1000, 4096, 3218291], "31": ["vertical_and_slash", 1000, 4096, 13215387]}, {"0": ["vertical_and_slash", 100, 800, 18645971], "1": ["vertical_and_slash", 30, 800, 587932], "2": ["vertical_and_slash", 1000, 4096, 10538505], "3": ["vertical_and_slash", 30, 800, 158559], "4": ["vertical_and_slash", 100, 800, 3376593], "5": ["vertical_and_slash", 100, 800, 18383338], "6": ["vertical_and_slash", 1000, 4096, 10074810], "7": ["vertical_and_slash", 1000, 4096, 19347044], "8": ["vertical_and_slash", 1000, 4096, 6794450], "9": ["vertical_and_slash", 1000, 4096, 3529136], "10": ["vertical_and_slash", 1000, 4096, 6952639], "11": ["vertical_and_slash", 1000, 4096, 9362393], "12": ["vertical_and_slash", 1000, 4096, 5368732], "13": ["vertical_and_slash", 100, 800, 705065], "14": ["vertical_and_slash", 100, 800, 628184], "15": ["vertical_and_slash", 1000, 4096, 7575979], "16": ["vertical_and_slash", 1000, 4096, 14825324], "17": ["vertical_and_slash", 100, 800, 584190], "18": ["vertical_and_slash", 1000, 4096, 14770220], "19": ["vertical_and_slash", 100, 800, 7324628], "20": ["vertical_and_slash", 100, 800, 13439080], "21": ["vertical_and_slash", 100, 800, 2173728], "22": ["vertical_and_slash", 100, 800, 1300676], "23": ["vertical_and_slash", 3500, 100, 20507565], "24": 
["vertical_and_slash", 3500, 100, 20826931], "25": ["vertical_and_slash", 100, 800, 16503925], "26": ["vertical_and_slash", 3500, 100, 20607984], "27": ["vertical_and_slash", 1000, 4096, 9100775], "28": ["vertical_and_slash", 3500, 100, 20540180], "29": ["vertical_and_slash", 1000, 4096, 19978707], "30": ["vertical_and_slash", 100, 800, 18084829], "31": ["vertical_and_slash", 100, 800, 15584755]}, {"0": ["vertical_and_slash", 100, 800, 14519032], "1": ["vertical_and_slash", 100, 800, 13637880], "2": ["vertical_and_slash", 3500, 100, 19712241], "3": ["vertical_and_slash", 100, 800, 14417159], "4": ["vertical_and_slash", 100, 800, 18931772], "5": ["vertical_and_slash", 3500, 100, 20278735], "6": ["vertical_and_slash", 100, 800, 21000177], "7": ["vertical_and_slash", 3500, 100, 20181815], "8": ["vertical_and_slash", 1000, 4096, 20667264], "9": ["vertical_and_slash", 1000, 4096, 13546806], "10": ["vertical_and_slash", 1000, 4096, 8056555], "11": ["vertical_and_slash", 1000, 4096, 14544259], "12": ["vertical_and_slash", 3500, 100, 14988539], "13": ["vertical_and_slash", 100, 800, 9925552], "14": ["vertical_and_slash", 1000, 4096, 16502140], "15": ["vertical_and_slash", 3500, 100, 1394], "16": ["vertical_and_slash", 100, 800, 6786191], "17": ["vertical_and_slash", 100, 800, 5142369], "18": ["vertical_and_slash", 1000, 4096, 18139060], "19": ["vertical_and_slash", 100, 800, 1817633], "20": ["vertical_and_slash", 100, 800, 1586931], "21": ["vertical_and_slash", 1000, 4096, 2981991], "22": ["vertical_and_slash", 1000, 4096, 19814245], "23": ["vertical_and_slash", 100, 800, 3823591], "24": ["vertical_and_slash", 1000, 4096, 11968181], "25": ["vertical_and_slash", 100, 800, 4245870], "26": ["vertical_and_slash", 100, 800, 6065658], "27": ["vertical_and_slash", 100, 800, 17099315], "28": ["vertical_and_slash", 100, 800, 14002976], "29": ["vertical_and_slash", 100, 800, 15062395], "30": ["vertical_and_slash", 3500, 100, 9832421], "31": ["vertical_and_slash", 100, 800, 329163]}, {"0": ["vertical_and_slash", 100, 800, 17881284], "1": ["vertical_and_slash", 100, 800, 6096065], "2": ["vertical_and_slash", 100, 800, 19512309], "3": ["vertical_and_slash", 100, 800, 1361094], "4": ["vertical_and_slash", 3500, 100, 21385650], "5": ["vertical_and_slash", 100, 800, 14152330], "6": ["vertical_and_slash", 100, 800, 15379238], "7": ["vertical_and_slash", 100, 800, 936209], "8": ["vertical_and_slash", 3500, 100, 7644919], "9": ["vertical_and_slash", 100, 800, 162434], "10": ["vertical_and_slash", 100, 800, 11548456], "11": ["vertical_and_slash", 100, 800, 11141282], "12": ["vertical_and_slash", 3500, 100, 6011727], "13": ["vertical_and_slash", 100, 800, 16026110], "14": ["vertical_and_slash", 100, 800, 466578], "15": ["vertical_and_slash", 100, 800, 4799040], "16": ["vertical_and_slash", 100, 800, 15252019], "17": ["vertical_and_slash", 1000, 4096, 7350605], "18": ["vertical_and_slash", 100, 800, 16896477], "19": ["vertical_and_slash", 1000, 4096, 5715502], "20": ["vertical_and_slash", 100, 800, 9885275], "21": ["vertical_and_slash", 100, 800, 8062274], "22": ["vertical_and_slash", 100, 800, 11341966], "23": ["vertical_and_slash", 3500, 100, 21639689], "24": ["vertical_and_slash", 1000, 4096, 7313536], "25": ["vertical_and_slash", 1000, 4096, 1858640], "26": ["vertical_and_slash", 100, 800, 17665215], "27": ["vertical_and_slash", 100, 800, 13827567], "28": ["vertical_and_slash", 1000, 4096, 16279088], "29": ["vertical_and_slash", 1000, 4096, 2728376], "30": ["vertical_and_slash", 1000, 4096, 20378804], "31": 
["vertical_and_slash", 1000, 4096, 11218561]}, {"0": ["vertical_and_slash", 100, 800, 10702989], "1": ["vertical_and_slash", 100, 800, 13911357], "2": ["vertical_and_slash", 100, 800, 2089505], "3": ["vertical_and_slash", 100, 800, 5795130], "4": ["vertical_and_slash", 100, 800, 6198580], "5": ["vertical_and_slash", 100, 800, 11025874], "6": ["vertical_and_slash", 1000, 4096, 4765707], "7": ["vertical_and_slash", 100, 800, 9275261], "8": ["vertical_and_slash", 100, 800, 356772], "9": ["vertical_and_slash", 100, 800, 6507763], "10": ["vertical_and_slash", 100, 800, 1057022], "11": ["vertical_and_slash", 100, 800, 16390639], "12": ["vertical_and_slash", 1000, 4096, 6504148], "13": ["vertical_and_slash", 100, 800, 5815163], "14": ["vertical_and_slash", 100, 800, 781258], "15": ["vertical_and_slash", 1000, 4096, 5306413], "16": ["vertical_and_slash", 100, 800, 7571947], "17": ["vertical_and_slash", 100, 800, 2246584], "18": ["vertical_and_slash", 1000, 4096, 6370179], "19": ["vertical_and_slash", 1000, 4096, 16329738], "20": ["vertical_and_slash", 100, 800, 810202], "21": ["vertical_and_slash", 100, 800, 9614219], "22": ["vertical_and_slash", 3500, 100, 21023608], "23": ["vertical_and_slash", 100, 800, 3697853], "24": ["vertical_and_slash", 500, 700, 623385], "25": ["vertical_and_slash", 100, 800, 2872545], "26": ["vertical_and_slash", 3500, 100, 21443890], "27": ["vertical_and_slash", 1000, 4096, 964593], "28": ["vertical_and_slash", 1000, 4096, 6046647], "29": ["vertical_and_slash", 1000, 4096, 3390663], "30": ["vertical_and_slash", 3500, 100, 21396110], "31": ["vertical_and_slash", 500, 700, 1185821]}, {"0": ["vertical_and_slash", 100, 800, 929038], "1": ["vertical_and_slash", 1000, 4096, 11917459], "2": ["vertical_and_slash", 1000, 4096, 11189817], "3": ["vertical_and_slash", 1000, 4096, 5290948], "4": ["vertical_and_slash", 100, 800, 2444153], "5": ["vertical_and_slash", 1000, 4096, 7367448], "6": ["vertical_and_slash", 1000, 4096, 3929914], "7": ["vertical_and_slash", 1000, 4096, 2907293], "8": ["vertical_and_slash", 30, 800, 8631190], "9": ["vertical_and_slash", 100, 800, 7657567], "10": ["vertical_and_slash", 1000, 4096, 5754225], "11": ["vertical_and_slash", 100, 800, 16484372], "12": ["vertical_and_slash", 100, 800, 7369987], "13": ["vertical_and_slash", 100, 800, 3365312], "14": ["vertical_and_slash", 30, 800, 461151], "15": ["vertical_and_slash", 500, 700, 315608], "16": ["vertical_and_slash", 1000, 4096, 16240364], "17": ["vertical_and_slash", 100, 800, 253597], "18": ["vertical_and_slash", 1000, 4096, 925109], "19": ["vertical_and_slash", 100, 800, 133339], "20": ["vertical_and_slash", 100, 800, 578256], "21": ["vertical_and_slash", 1000, 4096, 1817521], "22": ["vertical_and_slash", 3500, 100, 4918245], "23": ["vertical_and_slash", 1000, 4096, 114317], "24": ["vertical_and_slash", 3500, 100, 20949654], "25": ["vertical_and_slash", 3500, 100, 21380515], "26": ["vertical_and_slash", 1000, 4096, 20796309], "27": ["vertical_and_slash", 100, 800, 11897642], "28": ["vertical_and_slash", 1000, 4096, 17534343], "29": ["vertical_and_slash", 1000, 4096, 20051889], "30": ["vertical_and_slash", 1000, 4096, 20184777], "31": ["vertical_and_slash", 3500, 100, 20262011]}, {"0": ["vertical_and_slash", 1000, 4096, 8179346], "1": ["vertical_and_slash", 1000, 4096, 2423899], "2": ["vertical_and_slash", 100, 800, 13818895], "3": ["vertical_and_slash", 1000, 4096, 6522601], "4": ["vertical_and_slash", 1000, 4096, 1060263], "5": ["vertical_and_slash", 1000, 4096, 4157137], "6": ["vertical_and_slash", 
1000, 4096, 6990380], "7": ["vertical_and_slash", 1000, 4096, 10763715], "8": ["vertical_and_slash", 100, 800, 10123257], "9": ["vertical_and_slash", 1000, 4096, 9156840], "10": ["vertical_and_slash", 1000, 4096, 16029616], "11": ["vertical_and_slash", 100, 800, 1673944], "12": ["vertical_and_slash", 1000, 4096, 15001358], "13": ["vertical_and_slash", 1000, 4096, 11496585], "14": ["vertical_and_slash", 100, 800, 9006039], "15": ["vertical_and_slash", 1000, 4096, 13032008], "16": ["vertical_and_slash", 100, 800, 4813070], "17": ["vertical_and_slash", 100, 800, 1475285], "18": ["vertical_and_slash", 100, 800, 8000337], "19": ["vertical_and_slash", 100, 800, 8837856], "20": ["vertical_and_slash", 1000, 4096, 16977677], "21": ["vertical_and_slash", 100, 800, 4416649], "22": ["vertical_and_slash", 100, 800, 17025902], "23": ["vertical_and_slash", 100, 800, 602195], "24": ["vertical_and_slash", 3500, 100, 5765045], "25": ["vertical_and_slash", 100, 800, 13009069], "26": ["vertical_and_slash", 100, 800, 3523767], "27": ["vertical_and_slash", 100, 800, 6546733], "28": ["vertical_and_slash", 3500, 100, 3452012], "29": ["vertical_and_slash", 100, 800, 1510491], "30": ["vertical_and_slash", 3500, 100, 17227596], "31": ["vertical_and_slash", 3500, 100, 19660969]}, {"0": ["vertical_and_slash", 3500, 100, 6623789], "1": ["vertical_and_slash", 3500, 100, 3902994], "2": ["vertical_and_slash", 3500, 100, 6994928], "3": ["vertical_and_slash", 1000, 4096, 5149770], "4": ["vertical_and_slash", 3500, 100, 14836158], "5": ["vertical_and_slash", 100, 800, 17515427], "6": ["vertical_and_slash", 3500, 100, 7911558], "7": ["vertical_and_slash", 3500, 100, 9338861], "8": ["vertical_and_slash", 3500, 100, 14090410], "9": ["vertical_and_slash", 100, 800, 2492955], "10": ["vertical_and_slash", 3500, 100, 21732500], "11": ["vertical_and_slash", 100, 800, 2898121], "12": ["vertical_and_slash", 3500, 100, 10852444], "13": ["vertical_and_slash", 100, 800, 1940039], "14": ["vertical_and_slash", 3500, 100, 16338195], "15": ["vertical_and_slash", 100, 800, 2006495], "16": ["vertical_and_slash", 3500, 100, 10259390], "17": ["vertical_and_slash", 100, 800, 4065419], "18": ["vertical_and_slash", 100, 800, 12733273], "19": ["vertical_and_slash", 1000, 4096, 11751394], "20": ["vertical_and_slash", 100, 800, 15251186], "21": ["vertical_and_slash", 1000, 4096, 12287035], "22": ["vertical_and_slash", 1000, 4096, 5114508], "23": ["vertical_and_slash", 1000, 4096, 13162100], "24": ["vertical_and_slash", 100, 800, 8000122], "25": ["vertical_and_slash", 100, 800, 9281634], "26": ["vertical_and_slash", 100, 800, 1846488], "27": ["vertical_and_slash", 3500, 100, 8590692], "28": ["vertical_and_slash", 100, 800, 8643063], "29": ["vertical_and_slash", 100, 800, 5758817], "30": ["vertical_and_slash", 100, 800, 5877183], "31": ["vertical_and_slash", 100, 800, 7796906]}, {"0": ["vertical_and_slash", 100, 800, 20597532], "1": ["vertical_and_slash", 3500, 100, 21758452], "2": ["vertical_and_slash", 1000, 4096, 4144141], "3": ["vertical_and_slash", 100, 800, 20261887], "4": ["vertical_and_slash", 1000, 4096, 2512370], "5": ["vertical_and_slash", 3500, 100, 17706009], "6": ["vertical_and_slash", 1000, 4096, 19693735], "7": ["vertical_and_slash", 1000, 4096, 12879585], "8": ["vertical_and_slash", 3500, 100, 18330550], "9": ["vertical_and_slash", 1000, 4096, 395315], "10": ["vertical_and_slash", 100, 800, 12936460], "11": ["vertical_and_slash", 3500, 100, 20489362], "12": ["vertical_and_slash", 100, 800, 2920447], "13": ["vertical_and_slash", 3500, 
100, 19704987], "14": ["vertical_and_slash", 3500, 100, 19332279], "15": ["vertical_and_slash", 100, 800, 8771256], "16": ["vertical_and_slash", 100, 800, 5611994], "17": ["vertical_and_slash", 100, 800, 16087138], "18": ["vertical_and_slash", 500, 700, 891236], "19": ["vertical_and_slash", 3500, 100, 21427139], "20": ["vertical_and_slash", 100, 800, 1823410], "21": ["vertical_and_slash", 30, 800, 15408418], "22": ["vertical_and_slash", 500, 700, 9266226], "23": ["vertical_and_slash", 3500, 100, 17195724], "24": ["vertical_and_slash", 1000, 4096, 7809063], "25": ["vertical_and_slash", 100, 800, 14083150], "26": ["vertical_and_slash", 100, 800, 4139113], "27": ["vertical_and_slash", 100, 800, 10706318], "28": ["vertical_and_slash", 1000, 4096, 1105380], "29": ["vertical_and_slash", 100, 800, 3630717], "30": ["vertical_and_slash", 1000, 4096, 10664933], "31": ["vertical_and_slash", 100, 800, 9143007]}, {"0": ["vertical_and_slash", 1000, 4096, 301018], "1": ["vertical_and_slash", 3500, 100, 1784828], "2": ["vertical_and_slash", 3500, 100, 7055406], "3": ["vertical_and_slash", 3500, 100, 2086934], "4": ["vertical_and_slash", 1000, 4096, 4101320], "5": ["vertical_and_slash", 1000, 4096, 1042376], "6": ["vertical_and_slash", 3500, 100, 16976048], "7": ["vertical_and_slash", 500, 700, 1459641], "8": ["vertical_and_slash", 3500, 100, 1180323], "9": ["vertical_and_slash", 3500, 100, 21763195], "10": ["vertical_and_slash", 3500, 100, 5825008], "11": ["vertical_and_slash", 100, 800, 53453], "12": ["vertical_and_slash", 3500, 100, 11794796], "13": ["vertical_and_slash", 3500, 100, 1783957], "14": ["vertical_and_slash", 100, 800, 1440345], "15": ["vertical_and_slash", 100, 800, 16828397], "16": ["vertical_and_slash", 100, 800, 2469338], "17": ["vertical_and_slash", 100, 800, 4665593], "18": ["vertical_and_slash", 3500, 100, 10580848], "19": ["vertical_and_slash", 3500, 100, 19252331], "20": ["vertical_and_slash", 3500, 100, 20024825], "21": ["vertical_and_slash", 100, 800, 14850871], "22": ["vertical_and_slash", 3500, 100, 12678003], "23": ["vertical_and_slash", 100, 800, 1782447], "24": ["vertical_and_slash", 1000, 4096, 13287971], "25": ["vertical_and_slash", 3500, 100, 1097488], "26": ["vertical_and_slash", 1000, 4096, 2633009], "27": ["vertical_and_slash", 3500, 100, 1055757], "28": ["vertical_and_slash", 3500, 100, 742496], "29": ["vertical_and_slash", 1000, 4096, 4194904], "30": ["vertical_and_slash", 3500, 100, 1577446], "31": ["vertical_and_slash", 1000, 4096, 10526781]}, {"0": ["vertical_and_slash", 1000, 4096, 12079479], "1": ["vertical_and_slash", 3500, 100, 19962962], "2": ["vertical_and_slash", 1000, 4096, 12450062], "3": ["vertical_and_slash", 1000, 4096, 10400447], "4": ["vertical_and_slash", 100, 800, 11323650], "5": ["vertical_and_slash", 1000, 4096, 4102038], "6": ["vertical_and_slash", 1000, 4096, 3338557], "7": ["vertical_and_slash", 3500, 100, 9984816], "8": ["vertical_and_slash", 100, 800, 14524592], "9": ["vertical_and_slash", 100, 800, 2065326], "10": ["vertical_and_slash", 30, 800, 4596708], "11": ["vertical_and_slash", 500, 700, 10708236], "12": ["vertical_and_slash", 500, 700, 13397191], "13": ["vertical_and_slash", 500, 700, 1011260], "14": ["vertical_and_slash", 1000, 4096, 13165340], "15": ["vertical_and_slash", 1000, 4096, 825692], "16": ["vertical_and_slash", 3500, 100, 2810461], "17": ["vertical_and_slash", 3500, 100, 19569698], "18": ["vertical_and_slash", 3500, 100, 2251981], "19": ["vertical_and_slash", 500, 700, 5559642], "20": ["vertical_and_slash", 3500, 100, 
1522515], "21": ["vertical_and_slash", 1000, 4096, 982286], "22": ["vertical_and_slash", 1000, 4096, 2085881], "23": ["vertical_and_slash", 100, 800, 2055023], "24": ["vertical_and_slash", 1000, 4096, 1242380], "25": ["vertical_and_slash", 3500, 100, 1869920], "26": ["vertical_and_slash", 3500, 100, 12180284], "27": ["vertical_and_slash", 3500, 100, 14622044], "28": ["vertical_and_slash", 1000, 4096, 557560], "29": ["vertical_and_slash", 1000, 4096, 6987039], "30": ["vertical_and_slash", 100, 800, 15769951], "31": ["vertical_and_slash", 100, 800, 7721569]}, {"0": ["vertical_and_slash", 500, 700, 4382254], "1": ["vertical_and_slash", 3500, 100, 84219], "2": ["vertical_and_slash", 500, 700, 4734463], "3": ["vertical_and_slash", 500, 700, 3186548], "4": ["vertical_and_slash", 1000, 4096, 4063246], "5": ["vertical_and_slash", 1000, 4096, 12708225], "6": ["vertical_and_slash", 500, 700, 7742943], "7": ["vertical_and_slash", 100, 800, 15424159], "8": ["vertical_and_slash", 1000, 4096, 6301506], "9": ["vertical_and_slash", 1000, 4096, 2079847], "10": ["vertical_and_slash", 1000, 4096, 4217027], "11": ["vertical_and_slash", 1000, 4096, 6297884], "12": ["vertical_and_slash", 3500, 100, 4824003], "13": ["vertical_and_slash", 1000, 4096, 3960801], "14": ["vertical_and_slash", 1000, 4096, 10405673], "15": ["vertical_and_slash", 1000, 4096, 8272702], "16": ["vertical_and_slash", 3500, 100, 2874719], "17": ["vertical_and_slash", 1000, 4096, 13248253], "18": ["vertical_and_slash", 3500, 100, 16731069], "19": ["vertical_and_slash", 1000, 4096, 3488474], "20": ["vertical_and_slash", 3500, 100, 4911794], "21": ["vertical_and_slash", 3500, 100, 3300649], "22": ["vertical_and_slash", 3500, 100, 2239972], "23": ["vertical_and_slash", 1000, 4096, 847410], "24": ["vertical_and_slash", 1000, 4096, 12556756], "25": ["vertical_and_slash", 3500, 100, 10893823], "26": ["vertical_and_slash", 1000, 4096, 14168165], "27": ["vertical_and_slash", 1000, 4096, 14127548], "28": ["vertical_and_slash", 1000, 4096, 5277617], "29": ["vertical_and_slash", 1000, 4096, 16652651], "30": ["vertical_and_slash", 1000, 4096, 7991739], "31": ["vertical_and_slash", 3500, 100, 16136482]}, {"0": ["vertical_and_slash", 100, 800, 3776409], "1": ["vertical_and_slash", 100, 800, 3972530], "2": ["vertical_and_slash", 100, 800, 10166976], "3": ["vertical_and_slash", 100, 800, 13449519], "4": ["vertical_and_slash", 30, 800, 4621777], "5": ["vertical_and_slash", 30, 800, 17026761], "6": ["vertical_and_slash", 30, 800, 11401344], "7": ["vertical_and_slash", 100, 800, 3178997], "8": ["vertical_and_slash", 1000, 4096, 14919677], "9": ["vertical_and_slash", 100, 800, 13489170], "10": ["vertical_and_slash", 1000, 4096, 12483196], "11": ["vertical_and_slash", 1000, 4096, 18647183], "12": ["vertical_and_slash", 1000, 4096, 18488628], "13": ["vertical_and_slash", 3500, 100, 18285318], "14": ["vertical_and_slash", 3500, 100, 19771087], "15": ["vertical_and_slash", 100, 800, 11952058], "16": ["vertical_and_slash", 1000, 4096, 671303], "17": ["vertical_and_slash", 3500, 100, 20413410], "18": ["vertical_and_slash", 1000, 4096, 693843], "19": ["vertical_and_slash", 3500, 100, 20183012], "20": ["vertical_and_slash", 3500, 100, 4751982], "21": ["vertical_and_slash", 1000, 4096, 1190840], "22": ["vertical_and_slash", 3500, 100, 8189368], "23": ["vertical_and_slash", 3500, 100, 4191516], "24": ["vertical_and_slash", 100, 800, 9072597], "25": ["vertical_and_slash", 3500, 100, 6214053], "26": ["vertical_and_slash", 1000, 4096, 8848124], "27": ["vertical_and_slash", 
3500, 100, 1231805], "28": ["vertical_and_slash", 3500, 100, 3468573], "29": ["vertical_and_slash", 3500, 100, 16841594], "30": ["vertical_and_slash", 3500, 100, 12565098], "31": ["vertical_and_slash", 3500, 100, 4308210]}, {"0": ["vertical_and_slash", 100, 800, 405030], "1": ["vertical_and_slash", 3500, 100, 12737242], "2": ["vertical_and_slash", 1000, 4096, 6996254], "3": ["vertical_and_slash", 3500, 100, 4831216], "4": ["vertical_and_slash", 3500, 100, 5890590], "5": ["vertical_and_slash", 1000, 4096, 3008671], "6": ["vertical_and_slash", 1000, 4096, 4998230], "7": ["vertical_and_slash", 1000, 4096, 6509194], "8": ["vertical_and_slash", 3500, 100, 1774041], "9": ["vertical_and_slash", 3500, 100, 1372562], "10": ["vertical_and_slash", 3500, 100, 9111804], "11": ["vertical_and_slash", 1000, 4096, 1109182], "12": ["vertical_and_slash", 100, 800, 371771], "13": ["vertical_and_slash", 3500, 100, 905824], "14": ["vertical_and_slash", 1000, 4096, 4934535], "15": ["vertical_and_slash", 1000, 4096, 2841896], "16": ["vertical_and_slash", 3500, 100, 4614245], "17": ["vertical_and_slash", 3500, 100, 6900617], "18": ["vertical_and_slash", 3500, 100, 2824788], "19": ["vertical_and_slash", 100, 800, 6589423], "20": ["vertical_and_slash", 500, 700, 6357101], "21": ["vertical_and_slash", 30, 800, 5731632], "22": ["vertical_and_slash", 30, 800, 7261064], "23": ["vertical_and_slash", 500, 700, 9172114], "24": ["vertical_and_slash", 1000, 4096, 210349], "25": ["vertical_and_slash", 1000, 4096, 4526369], "26": ["vertical_and_slash", 1000, 4096, 2326769], "27": ["vertical_and_slash", 3500, 100, 5989844], "28": ["vertical_and_slash", 3500, 100, 1393004], "29": ["vertical_and_slash", 3500, 100, 2114704], "30": ["vertical_and_slash", 3500, 100, 776564], "31": ["vertical_and_slash", 3500, 100, 2826514]}, {"0": ["vertical_and_slash", 1000, 4096, 4747927], "1": ["vertical_and_slash", 3500, 100, 14468785], "2": ["vertical_and_slash", 3500, 100, 10124003], "3": ["vertical_and_slash", 3500, 100, 6702061], "4": ["vertical_and_slash", 1000, 4096, 2311190], "5": ["vertical_and_slash", 1000, 4096, 2412642], "6": ["vertical_and_slash", 1000, 4096, 2782532], "7": ["vertical_and_slash", 3500, 100, 6699063], "8": ["vertical_and_slash", 100, 800, 10899273], "9": ["vertical_and_slash", 100, 800, 571205], "10": ["vertical_and_slash", 1000, 4096, 2224039], "11": ["vertical_and_slash", 3500, 100, 5206481], "12": ["vertical_and_slash", 100, 800, 6039530], "13": ["vertical_and_slash", 3500, 100, 6121024], "14": ["vertical_and_slash", 1000, 4096, 915849], "15": ["vertical_and_slash", 3500, 100, 4393793], "16": ["vertical_and_slash", 1000, 4096, 4168491], "17": ["vertical_and_slash", 3500, 100, 5568206], "18": ["vertical_and_slash", 1000, 4096, 1087118], "19": ["vertical_and_slash", 1000, 4096, 2691708], "20": ["vertical_and_slash", 3500, 100, 4351677], "21": ["vertical_and_slash", 3500, 100, 3933999], "22": ["vertical_and_slash", 3500, 100, 3997663], "23": ["vertical_and_slash", 3500, 100, 3522236], "24": ["vertical_and_slash", 3500, 100, 9956224], "25": ["vertical_and_slash", 3500, 100, 4192895], "26": ["vertical_and_slash", 3500, 100, 9150842], "27": ["vertical_and_slash", 3500, 100, 12754903], "28": ["vertical_and_slash", 3500, 100, 7346979], "29": ["vertical_and_slash", 100, 800, 9422285], "30": ["vertical_and_slash", 100, 800, 3140769], "31": ["vertical_and_slash", 500, 700, 2415994]}, {"0": ["vertical_and_slash", 3500, 100, 4352921], "1": ["vertical_and_slash", 1000, 4096, 3398326], "2": ["vertical_and_slash", 3500, 100, 
5788760], "3": ["vertical_and_slash", 1000, 4096, 2945608], "4": ["vertical_and_slash", 3500, 100, 1988612], "5": ["vertical_and_slash", 1000, 4096, 3736165], "6": ["vertical_and_slash", 1000, 4096, 9670660], "7": ["vertical_and_slash", 3500, 100, 3803388], "8": ["vertical_and_slash", 3500, 100, 3612542], "9": ["vertical_and_slash", 3500, 100, 4948698], "10": ["vertical_and_slash", 3500, 100, 4880140], "11": ["vertical_and_slash", 3500, 100, 2083345], "12": ["vertical_and_slash", 3500, 100, 4683160], "13": ["vertical_and_slash", 3500, 100, 3650326], "14": ["vertical_and_slash", 3500, 100, 1071456], "15": ["vertical_and_slash", 1000, 4096, 3490570], "16": ["vertical_and_slash", 1000, 4096, 1082160], "17": ["vertical_and_slash", 3500, 100, 6888781], "18": ["vertical_and_slash", 1000, 4096, 2664476], "19": ["vertical_and_slash", 3500, 100, 2759933], "20": ["vertical_and_slash", 100, 800, 653736], "21": ["vertical_and_slash", 3500, 100, 9517662], "22": ["vertical_and_slash", 3500, 100, 3973048], "23": ["vertical_and_slash", 3500, 100, 5761264], "24": ["vertical_and_slash", 3500, 100, 13615692], "25": ["vertical_and_slash", 1000, 4096, 5235320], "26": ["vertical_and_slash", 3500, 100, 10009513], "27": ["vertical_and_slash", 1000, 4096, 2682717], "28": ["vertical_and_slash", 3500, 100, 11382630], "29": ["vertical_and_slash", 3500, 100, 3802301], "30": ["vertical_and_slash", 1000, 4096, 3025864], "31": ["vertical_and_slash", 1000, 4096, 1725752]}, {"0": ["vertical_and_slash", 1000, 4096, 12877084], "1": ["vertical_and_slash", 1000, 4096, 11642564], "2": ["vertical_and_slash", 1000, 4096, 10978654], "3": ["vertical_and_slash", 3500, 100, 14674762], "4": ["vertical_and_slash", 1000, 4096, 8335239], "5": ["vertical_and_slash", 1000, 4096, 11808042], "6": ["vertical_and_slash", 1000, 4096, 10213550], "7": ["vertical_and_slash", 3500, 100, 14957853], "8": ["vertical_and_slash", 500, 700, 19867441], "9": ["vertical_and_slash", 100, 800, 10566603], "10": ["vertical_and_slash", 3500, 100, 19670449], "11": ["vertical_and_slash", 1000, 4096, 12608408], "12": ["vertical_and_slash", 3500, 100, 19432490], "13": ["vertical_and_slash", 3500, 100, 21127812], "14": ["vertical_and_slash", 3500, 100, 16648204], "15": ["vertical_and_slash", 1000, 4096, 10819630], "16": ["vertical_and_slash", 3500, 100, 5741199], "17": ["vertical_and_slash", 3500, 100, 2265976], "18": ["vertical_and_slash", 1000, 4096, 1571848], "19": ["vertical_and_slash", 3500, 100, 12168656], "20": ["vertical_and_slash", 3500, 100, 12687129], "21": ["vertical_and_slash", 1000, 4096, 4052254], "22": ["vertical_and_slash", 3500, 100, 9260206], "23": ["vertical_and_slash", 1000, 4096, 4467273], "24": ["vertical_and_slash", 100, 800, 17813181], "25": ["vertical_and_slash", 3500, 100, 21532596], "26": ["vertical_and_slash", 1000, 4096, 14291589], "27": ["vertical_and_slash", 1000, 4096, 17941032], "28": ["vertical_and_slash", 1000, 4096, 20269858], "29": ["vertical_and_slash", 100, 800, 16481898], "30": ["vertical_and_slash", 100, 800, 14035138], "31": ["vertical_and_slash", 3500, 100, 5218579]}, {"0": ["vertical_and_slash", 1000, 4096, 15472775], "1": ["vertical_and_slash", 500, 700, 16487444], "2": ["vertical_and_slash", 1000, 4096, 13062108], "3": ["vertical_and_slash", 1000, 4096, 17155780], "4": ["vertical_and_slash", 1000, 4096, 9528835], "5": ["vertical_and_slash", 1000, 4096, 18482684], "6": ["vertical_and_slash", 1000, 4096, 17086801], "7": ["vertical_and_slash", 100, 800, 16495168], "8": ["vertical_and_slash", 1000, 4096, 6931295], "9": 
["vertical_and_slash", 3500, 100, 21960054], "10": ["vertical_and_slash", 1000, 4096, 13941150], "11": ["vertical_and_slash", 3500, 100, 6249722], "12": ["vertical_and_slash", 1000, 4096, 12292065], "13": ["vertical_and_slash", 3500, 100, 14056066], "14": ["vertical_and_slash", 1000, 4096, 17988711], "15": ["vertical_and_slash", 3500, 100, 13838932], "16": ["vertical_and_slash", 3500, 100, 11542474], "17": ["vertical_and_slash", 1000, 4096, 10272174], "18": ["vertical_and_slash", 3500, 100, 10106952], "19": ["vertical_and_slash", 1000, 4096, 11953729], "20": ["vertical_and_slash", 1000, 4096, 12125335], "21": ["vertical_and_slash", 1000, 4096, 5421557], "22": ["vertical_and_slash", 1000, 4096, 17046156], "23": ["vertical_and_slash", 1000, 4096, 13763363], "24": ["vertical_and_slash", 1000, 4096, 14971340], "25": ["vertical_and_slash", 1000, 4096, 13949429], "26": ["vertical_and_slash", 1000, 4096, 13427580], "27": ["vertical_and_slash", 1000, 4096, 12712355], "28": ["vertical_and_slash", 1000, 4096, 10262417], "29": ["vertical_and_slash", 1000, 4096, 14593517], "30": ["vertical_and_slash", 3500, 100, 19020287], "31": ["vertical_and_slash", 1000, 4096, 16309396]}, {"0": ["vertical_and_slash", 100, 800, 6402139], "1": ["vertical_and_slash", 500, 700, 8580595], "2": ["vertical_and_slash", 3500, 100, 6974040], "3": ["vertical_and_slash", 500, 700, 9230357], "4": ["vertical_and_slash", 500, 700, 1458178], "5": ["vertical_and_slash", 3500, 100, 12626929], "6": ["vertical_and_slash", 500, 700, 7367522], "7": ["vertical_and_slash", 30, 800, 16753754], "8": ["vertical_and_slash", 100, 800, 16185443], "9": ["vertical_and_slash", 30, 800, 13212259], "10": ["vertical_and_slash", 30, 800, 16869582], "11": ["vertical_and_slash", 100, 800, 8982160], "12": ["vertical_and_slash", 3500, 100, 15101824], "13": ["vertical_and_slash", 500, 700, 10028751], "14": ["vertical_and_slash", 30, 800, 18999889], "15": ["vertical_and_slash", 100, 800, 15535188], "16": ["vertical_and_slash", 1000, 4096, 3376934], "17": ["vertical_and_slash", 1000, 4096, 3838435], "18": ["vertical_and_slash", 1000, 4096, 2789787], "19": ["vertical_and_slash", 1000, 4096, 9668519], "20": ["vertical_and_slash", 500, 700, 16137894], "21": ["vertical_and_slash", 1000, 4096, 3380197], "22": ["vertical_and_slash", 500, 700, 6788616], "23": ["vertical_and_slash", 1000, 4096, 4978497], "24": ["vertical_and_slash", 3500, 100, 9896749], "25": ["vertical_and_slash", 500, 700, 20982412], "26": ["vertical_and_slash", 1000, 4096, 5738438], "27": ["vertical_and_slash", 1000, 4096, 14533987], "28": ["vertical_and_slash", 3500, 100, 11385648], "29": ["vertical_and_slash", 30, 800, 11091461], "30": ["vertical_and_slash", 1000, 4096, 7801211], "31": ["vertical_and_slash", 1000, 4096, 12946499]}, {"0": ["vertical_and_slash", 1000, 4096, 8005141], "1": ["vertical_and_slash", 30, 800, 9683398], "2": ["vertical_and_slash", 100, 800, 15684848], "3": ["vertical_and_slash", 30, 800, 10783581], "4": ["vertical_and_slash", 30, 800, 12674711], "5": ["vertical_and_slash", 100, 800, 17627426], "6": ["vertical_and_slash", 500, 700, 6603740], "7": ["vertical_and_slash", 30, 800, 8037793], "8": ["vertical_and_slash", 1000, 4096, 18603355], "9": ["vertical_and_slash", 100, 800, 18175297], "10": ["vertical_and_slash", 1000, 4096, 15415235], "11": ["vertical_and_slash", 100, 800, 8188133], "12": ["vertical_and_slash", 100, 800, 16790430], "13": ["vertical_and_slash", 1000, 4096, 4440951], "14": ["vertical_and_slash", 1000, 4096, 12155674], "15": ["vertical_and_slash", 3500, 
100, 18728501], "16": ["vertical_and_slash", 30, 800, 8282869], "17": ["vertical_and_slash", 30, 800, 18611641], "18": ["vertical_and_slash", 30, 800, 7125529], "19": ["vertical_and_slash", 30, 800, 9867525], "20": ["vertical_and_slash", 100, 800, 8121064], "21": ["vertical_and_slash", 100, 800, 8406786], "22": ["vertical_and_slash", 30, 800, 11020990], "23": ["vertical_and_slash", 30, 800, 4944682], "24": ["vertical_and_slash", 30, 800, 16714152], "25": ["vertical_and_slash", 30, 800, 9194588], "26": ["vertical_and_slash", 500, 700, 9003731], "27": ["vertical_and_slash", 1000, 4096, 6939820], "28": ["vertical_and_slash", 500, 700, 10839557], "29": ["vertical_and_slash", 500, 700, 14432584], "30": ["vertical_and_slash", 100, 800, 12363347], "31": ["vertical_and_slash", 30, 800, 14465663]}]
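Note: each *_kv_out_v32_fit_o_best_pattern.json file added in this commit is a JSON list with one dict per transformer layer; each dict maps a head index (as a string) to a four-element list ["vertical_and_slash", vertical_size, slash_size, score], i.e. the sparse-pattern name, the number of vertical and slash (diagonal) lines kept for that head, and what appears to be the score from the offline pattern search. A minimal sketch of inspecting such a file (the filename and the reading of the last field are assumptions based on the entries above):

    import json

    # Load the per-layer, per-head best patterns (filename assumed for illustration).
    with open("Llama_3_8B_Instruct_262k_kv_out_v32_fit_o_best_pattern.json") as f:
        best_patterns = json.load(f)  # one dict per layer

    # Layer 0, head 0: pattern name, vertical/slash budgets, search score (assumed).
    pattern, n_vertical, n_slash, score = best_patterns[0]["0"]
    print(pattern, n_vertical, n_slash, score)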
minference/configs/model2path.py ADDED
@@ -0,0 +1,17 @@
+ import os
+
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+ MODEL2PATH = {
+     "gradientai/Llama-3-8B-Instruct-262k": os.path.join(
+         BASE_DIR, "Llama_3_8B_Instruct_262k_kv_out_v32_fit_o_best_pattern.json"
+     ),
+     "gradientai/Llama-3-8B-Instruct-Gradient-1048k": os.path.join(
+         BASE_DIR, "Llama_3_8B_Instruct_262k_kv_out_v32_fit_o_best_pattern.json"
+     ),
+     "01-ai/Yi-9B-200K": os.path.join(
+         BASE_DIR, "Yi_9B_200k_kv_out_v32_fit_o_best_pattern.json"
+     ),
+     "microsoft/Phi-3-mini-128k-instruct": os.path.join(
+         BASE_DIR, "Phi_3_mini_128k_instruct_kv_out_v32_fit_o_best_pattern.json"
+     ),
+ }
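For reference, MODEL2PATH resolves a supported model name to the pattern file shipped next to this module; a minimal usage sketch, assuming the package is importable as minference:

    from minference.configs.model2path import MODEL2PATH

    # Look up the bundled sparse-pattern config for a supported model.
    config_path = MODEL2PATH["gradientai/Llama-3-8B-Instruct-262k"]
    print(config_path)  # .../configs/Llama_3_8B_Instruct_262k_kv_out_v32_fit_o_best_pattern.json

Note that the Gradient-1048k entry points at the same 262k pattern file.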
minference/minference_configuration.py ADDED
@@ -0,0 +1,49 @@
+ import os
+
+ from .configs.model2path import MODEL2PATH
+
+
+ class MInferenceConfig:
+     ATTENTION_TYPES = [
+         "minference",
+         "minference_with_dense",
+         "static",
+         "dilated1",
+         "dilated2",
+         "streaming",
+         "inf_llm",
+         "vllm",
+     ]
+
+     def __init__(
+         self,
+         attn_type: str = "minference",
+         model_name: str = None,
+         config_path: str = None,
+         starting_layer: int = -1,
+         kv_cache_cpu: bool = False,
+         use_snapkv: bool = False,
+         is_search: bool = False,
+         attn_kwargs: dict = {},
+         **kwargs,
+     ):
+         super(MInferenceConfig, self).__init__()
+         assert (
+             attn_type in self.ATTENTION_TYPES
+         ), f"The attention_type {attn_type} you specified is not supported."
+         self.attn_type = attn_type
+         self.config_path = self.update_config_path(config_path, model_name)
+         self.model_name = model_name
+         self.is_search = is_search
+         self.starting_layer = starting_layer
+         self.kv_cache_cpu = kv_cache_cpu
+         self.use_snapkv = use_snapkv
+         self.attn_kwargs = attn_kwargs
+
+     def update_config_path(self, config_path: str, model_name: str):
+         if config_path is not None:
+             return config_path
+         assert (
+             model_name in MODEL2PATH
+         ), f"The model {model_name} you specified is not supported. You are welcome to add it and open a PR :)"
+         return MODEL2PATH[model_name]
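MInferenceConfig validates the attention type against ATTENTION_TYPES and falls back to the MODEL2PATH lookup only when no explicit config_path is given; a short sketch of both paths, assuming the module layout above:

    from minference.minference_configuration import MInferenceConfig

    # Resolve the bundled pattern file via the model name.
    cfg = MInferenceConfig(
        attn_type="minference",
        model_name="gradientai/Llama-3-8B-Instruct-262k",
    )
    print(cfg.config_path)

    # An explicit config_path skips the MODEL2PATH lookup entirely
    # ("my_pattern.json" is a hypothetical file for illustration).
    cfg = MInferenceConfig(attn_type="minference", config_path="my_pattern.json")

An unsupported attn_type, or an unlisted model without a config_path, fails the corresponding assert.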
minference/models_patch.py ADDED
@@ -0,0 +1,100 @@
+ import os
+
+ from .minference_configuration import MInferenceConfig
+ from .patch import minference_patch, minference_patch_vllm, patch_hf
+
+
+ class MInference:
+     def __init__(
+         self,
+         attn_type: str = "minference",
+         model_name: str = None,
+         config_path: str = None,
+         starting_layer: int = -1,
+         kv_cache_cpu: bool = False,
+         use_snapkv: bool = False,
+         is_search: bool = False,
+         attn_kwargs: dict = {},
+         **kwargs,
+     ):
+         super(MInference, self).__init__()
+         self.config = MInferenceConfig(
+             attn_type=attn_type,
+             model_name=model_name,
+             config_path=config_path,
+             starting_layer=starting_layer,
+             kv_cache_cpu=kv_cache_cpu,
+             use_snapkv=use_snapkv,
+             is_search=is_search,
+             attn_kwargs=attn_kwargs,
+             **kwargs,
+         )
+
+     def __call__(self, model):
+         return self.patch_model(model)
+
+     def patch_model(self, model):
+         if self.config.attn_type != "vllm":
+             model.config.starting_layer = self.config.starting_layer
+             model.config.config_path = self.config.config_path
+
+         if self.config.attn_type == "minference":
+             model.config.is_search = self.config.is_search
+             model = minference_patch(model, self.config)
+
+         elif self.config.attn_type == "minference_with_dense":
+             model.config.dense = True
+             model = minference_patch(model, self.config)
+
+         elif self.config.attn_type == "dilated1":
+             model.config.dilated1 = True
+             model = minference_patch(model, self.config)
+
+         elif self.config.attn_type == "static":
+             model.config.static_pattern = True
+             model = minference_patch(model, self.config)
+
+         elif self.config.attn_type == "dilated2":
+             model.config.dilated2 = True
+             model = minference_patch(model, self.config)
+
+         elif self.config.attn_type == "streaming":
+             model.config.streaming = True
+             model.config.streaming_kwargs = {
+                 "n_local": 3968,
+                 "n_init": 128,
+                 **self.config.attn_kwargs,
+             }
+             model = minference_patch(model, self.config)
+
+         elif self.config.attn_type == "streaming2":
+             model = patch_hf(
+                 model,
+                 attn_type="streaming",
+                 attn_kwargs={"n_local": 3968, "n_init": 128, **self.config.attn_kwargs},
+             )
+         elif self.config.attn_type == "inf_llm":
+             model = patch_hf(
+                 model,
+                 attn_type="inf_llm",
+                 attn_kwargs={
+                     "block_size": 128,
+                     "n_init": 128,
+                     "n_local": 4096,
+                     "topk": 16,
+                     "repr_topk": 4,
+                     "max_cached_block": 32,
+                     "exc_block_size": 512,
+                     "base": 1000000,
+                     "distance_scale": 1.0,
+                     "dense_decoding": True,
+                     **self.config.attn_kwargs,
+                 },
+             )
+         elif self.config.attn_type == "vllm":
+             model = minference_patch_vllm(model, self.config.config_path)
+         else:
+             raise ValueError(
+                 f"The attention type {self.config.attn_type} you specified is not supported."
+             )
+         return model
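End to end, MInference is applied as a post-hoc patch on an already-loaded Hugging Face model; a minimal sketch, assuming MInference is re-exported at the package root and a CUDA GPU with flash-attn is available:

    import torch
    from transformers import AutoModelForCausalLM
    from minference import MInference  # assumed package-root export

    name = "gradientai/Llama-3-8B-Instruct-262k"
    model = AutoModelForCausalLM.from_pretrained(
        name, torch_dtype=torch.bfloat16, device_map="auto"
    )
    # __call__ delegates to patch_model, which rewrites the attention layers in place.
    model = MInference("minference", name)(model)

For attn_type="vllm", the patched object is expected to be a vLLM model instead, and only the pattern config path is forwarded.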
minference/modules/inf_llm.py ADDED
@@ -0,0 +1,1296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from copy import deepcopy
2
+ from typing import Optional, Tuple
3
+
4
+ import torch
5
+ from flash_attn import flash_attn_func
6
+ from transformers.modeling_outputs import CausalLMOutput
7
+
8
+ from ..ops.streaming_kernel import TritonMultiStageDotProductionAttention
9
+
10
+
11
+ class CudaCache:
12
+ def __init__(self, num_units, unit_size, dtype):
13
+ self.num_units = num_units
14
+ self.unit_size = unit_size
15
+ self.dtype = dtype
16
+ self.data = torch.empty((num_units, unit_size), device="cuda", dtype=dtype)
17
+ self.idle_set = set(list(range(num_units)))
18
+
19
+ def alloc(self):
20
+ assert len(self.idle_set) > 0
21
+
22
+ idx = self.idle_set.pop()
23
+ return self.data[idx], idx
24
+
25
+ def delete(self, idx):
26
+ assert idx not in self.idle_set
27
+ self.idle_set.add(idx)
28
+
29
+
30
+ class MemoryUnit:
31
+ def __init__(
32
+ self,
33
+ kv: Tuple[torch.Tensor, torch.Tensor],
34
+ cache: CudaCache,
35
+ load_to_cache: bool = False,
36
+ pin_memory: bool = False,
37
+ ):
38
+ self.cache = cache
39
+
40
+ if kv[0].is_cuda:
41
+ cpu_data = tuple(_t.contiguous().to("cpu", non_blocking=True) for _t in kv)
42
+ else:
43
+ cpu_data = tuple(_t.contiguous() for _t in kv)
44
+
45
+ if pin_memory:
46
+ cpu_data = tuple(_t.pin_memory() for _t in cpu_data)
47
+
48
+ if load_to_cache:
49
+ gpu_data, gpu_data_id = cache.alloc()
50
+ gpu_data = gpu_data.view((2,) + kv[0].shape)
51
+ gpu_data[0].copy_(kv[0], non_blocking=True)
52
+ gpu_data[1].copy_(kv[1], non_blocking=True)
53
+ event = torch.cuda.Event()
54
+ event.record(torch.cuda.current_stream())
55
+ else:
56
+ gpu_data, gpu_data_id = None, None
57
+ event = None
58
+
59
+ self.cpu_data = cpu_data
60
+ self.gpu_data = gpu_data
61
+ self.gpu_data_id = gpu_data_id
62
+ self.event = event
63
+
64
+ def load(self, target: Optional[Tuple[torch.Tensor, torch.Tensor]] = None) -> bool:
65
+ if self.gpu_data is not None:
66
+ if target is not None:
67
+ target[0].copy_(self.gpu_data[0], non_blocking=True)
68
+ target[1].copy_(self.gpu_data[1], non_blocking=True)
69
+ target_event = torch.cuda.Event()
70
+ target_event.record(torch.cuda.current_stream())
71
+ else:
72
+ target_event = None
73
+
74
+ return False, target_event
75
+
76
+ gpu_data, gpu_data_id = self.cache.alloc()
77
+ gpu_data = gpu_data.view((2,) + self.cpu_data[0].shape)
78
+ if target is not None:
79
+ target[0].copy_(self.cpu_data[0], non_blocking=True)
80
+ target[1].copy_(self.cpu_data[1], non_blocking=True)
81
+ target_event = torch.cuda.Event()
82
+ target_event.record(torch.cuda.current_stream())
83
+ gpu_data[0].copy_(target[0], non_blocking=True)
84
+ gpu_data[1].copy_(target[1], non_blocking=True)
85
+
86
+ else:
87
+ gpu_data[0].copy_(self.cpu_data[0], non_blocking=True)
88
+ gpu_data[1].copy_(self.cpu_data[1], non_blocking=True)
89
+
90
+ event = torch.cuda.Event()
91
+ event.record(torch.cuda.current_stream())
92
+ self.event = event
93
+ self.gpu_data = gpu_data
94
+ self.gpu_data_id = gpu_data_id
95
+
96
+ return True, target_event
97
+
98
+ def get(self):
99
+ assert self.gpu_data is not None
100
+ self.event.wait()
101
+ return self.gpu_data
102
+
103
+ def offload(self):
104
+ assert self.gpu_data is not None
105
+ self.event.wait()
106
+ self.gpu_data = None
107
+ self.cache.delete(self.gpu_data_id)
108
+ self.gpu_data_id = None
109
+
110
+
111
+ class VectorTensor:
112
+ def __init__(self, hidden_size, element_dtype):
113
+ init_cached_size = 16
114
+ self.data = torch.empty(
115
+ (init_cached_size, hidden_size), dtype=element_dtype, device="cuda"
116
+ )
117
+ self.length = 0
118
+ self.cache_size = init_cached_size
119
+ self.hidden_size = hidden_size
120
+
121
+ def append_cache(self):
122
+ new_cache_size = self.cache_size * 2
123
+ data_shape = self.data.shape
124
+ new_data = torch.empty(
125
+ (new_cache_size,) + data_shape[1:], device="cuda", dtype=self.data.dtype
126
+ )
127
+ new_data[: self.cache_size, ...].copy_(self.data)
128
+ self.data = new_data
129
+ self.cache_size = new_cache_size
130
+
131
+ def append(self, tensor: torch.Tensor):
132
+ assert tensor.dtype == self.data.dtype
133
+ assert tensor.size(1) == self.hidden_size
134
+ assert tensor.is_contiguous()
135
+
136
+ append_l = tensor.size(0)
137
+
138
+ while self.length + append_l > self.cache_size:
139
+ self.append_cache()
140
+
141
+ self.data[self.length : self.length + append_l, ...].copy_(tensor)
142
+
143
+ self.length += append_l
144
+
145
+ def get_data(self):
146
+ return self.data[: self.length, ...]
147
+
148
+ def get_topk(self, tensor: torch.Tensor, topk): # inner product
149
+ assert tensor.dim() == 1 and tensor.size(0) == self.hidden_size
150
+ logits = torch.matmul(self.data[: self.length], tensor[:, None]).squeeze(dim=-1)
151
+ assert logits.dim() == 1 and logits.size(0) == self.length
152
+ return logits.topk(topk, dim=0).indices.cpu().tolist()
153
+
154
+ def __len__(self):
155
+ return self.length
156
+
157
+
158
+ class Faiss:
159
+ def __init__(self, hidden_size, element_dtype):
160
+ import faiss
161
+
162
+ # We use the CPU index here because the GPU index requires a long initialization time
163
+ self.index = faiss.IndexFlatIP(hidden_size)
164
+ self.hidden_size = hidden_size
165
+
166
+ def append(self, tensor: torch.Tensor):
167
+ assert tensor.dim() == 2 and tensor.size(1) == self.hidden_size
168
+ self.index.add(tensor.cpu().float().numpy().astype("float32"))
169
+
170
+ def get_data(self):
171
+ raise ValueError
172
+
173
+ def get_topk(self, tensor: torch.Tensor, topk):
174
+ assert tensor.dim() == 1 and tensor.size(0) == self.hidden_size
175
+ xq = tensor[None, :].cpu().float().numpy().astype("float32")
176
+ topk_index = self.index.search(xq, topk)[1][0].tolist()
177
+ return topk_index
178
+
179
+ def __len__(self):
180
+ return self.index.ntotal
181
+
182
+
183
+ GLOBAL_STREAM = None
184
+
185
+
186
+ class ContextManager:
187
+ def __init__(
188
+ self,
189
+ position_embedding,
190
+ n_init,
191
+ n_local,
192
+ block_size,
193
+ max_cached_block,
194
+ topk,
195
+ exc_block_size,
196
+ score_decay: Optional[float] = None,
197
+ repr_topk: int = 1,
198
+ cache_strategy="lru",
199
+ chunk_topk_calc: Optional[int] = None,
200
+ async_global_stream: bool = False,
201
+ pin_memory: bool = False,
202
+ faiss: bool = False,
203
+ perhead: bool = False,
204
+ dense_decoding: bool = False,
205
+ ):
206
+ self.length = 0
207
+ self.position_embedding = position_embedding
208
+ self.n_init = n_init
209
+ self.n_local = n_local
210
+ self.block_size = block_size
211
+ self.max_cached_block = max_cached_block
212
+ self.exc_block_size = exc_block_size
213
+ self.score_decay = score_decay
214
+ assert exc_block_size <= n_local # no global token in input
215
+ self.topk = topk
216
+ self.Attn = TritonMultiStageDotProductionAttention
217
+ self.initialized = False
218
+ self.repr_topk = repr_topk
219
+ self.cache_strategy = cache_strategy
220
+ self.load_count = 0
221
+ self.chunk_topk_calc = chunk_topk_calc
222
+ self.async_global_stream = async_global_stream
223
+ self.pin_memory = pin_memory
224
+ self.faiss = faiss
225
+ self.perhead = perhead
226
+
227
+ self.dense_decoding = dense_decoding
228
+
229
+ global GLOBAL_STREAM
230
+ if self.async_global_stream and GLOBAL_STREAM is None:
231
+ GLOBAL_STREAM = torch.cuda.Stream()
232
+
233
+ assert cache_strategy in ["lru", "lru-s"]
234
+
235
+ if cache_strategy == "lru-s":
236
+ self.calc_block_score = True
237
+ else:
238
+ self.calc_block_score = False
239
+
240
+ def remove_lru_blocks(
241
+ self, u, num_remove: Optional[int] = None, ignore_blocks=None
242
+ ):
243
+ if num_remove is None:
244
+ num_remove = len(self.cached_blocks[u]) - self.max_cached_block
245
+
246
+ if num_remove <= 0:
247
+ return
248
+
249
+ lst = list(self.cached_blocks[u].items())
250
+ lst.sort(key=lambda x: x[1])
251
+
252
+ removed = 0
253
+ for i in range(len(lst)):
254
+ idx = lst[i][0]
255
+ if ignore_blocks is None or (idx not in ignore_blocks):
256
+ self.global_blocks[u][idx].offload()
257
+ self.cached_blocks[u].pop(idx)
258
+ removed += 1
259
+
260
+ if removed >= num_remove:
261
+ return
262
+
263
+ def get_block_k(self, k, score):
264
+ assert isinstance(score, torch.Tensor)
265
+ assert k.dim() >= 2
266
+ k = self.from_group_kv(k)
267
+ assert k.shape[:-1] == score.shape
268
+ assert k.shape[-2] == self.block_size
269
+ score_topk = score.topk(self.repr_topk, dim=-1).indices
270
+ assert score_topk.shape == (self.num_units, self.unit_size, self.repr_topk)
271
+ ret = torch.gather(
272
+ k,
273
+ -2,
274
+ score_topk[:, :, :, None].expand(
275
+ self.num_units, self.unit_size, self.repr_topk, self.dim_head
276
+ ),
277
+ )
278
+ return ret
279
+
280
+ def from_group_kv(self, tensor):
281
+ assert tensor.dim() == 4
282
+ assert tensor.size(1) == self.num_heads_kv
283
+ if self.num_heads == self.num_heads_kv:
284
+ return tensor
285
+ _, _, length, dim_head = tensor.shape
286
+ num_group = self.num_heads // self.num_heads_kv
287
+ tensor = tensor.view((self.num_units, self.unit_size_kv, 1, length, dim_head))
288
+ tensor = tensor.expand(
289
+ (self.num_units, self.unit_size_kv, num_group, length, dim_head)
290
+ ).reshape((self.num_units, self.num_heads, length, dim_head))
291
+ return tensor
292
+
293
+ def init(self, local_q, local_k, local_v, global_q, global_k, global_v):
294
+ assert local_q.dim() == 4
295
+ batch_size, num_heads, len_q, dim_head = local_q.shape
296
+ num_heads_kv = local_k.size(1)
297
+
298
+ for _t in [local_q, local_k, local_v, global_q, global_k, global_v]:
299
+ assert _t.size(0) == batch_size
300
+ assert _t.size(1) == num_heads or _t.size(1) == num_heads_kv
301
+ assert _t.size(2) == len_q
302
+ assert _t.size(3) == dim_head
303
+ assert _t.is_cuda
304
+
305
+ self.batch_size = batch_size
306
+ self.num_heads = num_heads
307
+ self.num_heads_kv = num_heads_kv
308
+ self.dim_head = dim_head
309
+ self.num_units = batch_size
310
+ self.unit_size = num_heads
311
+ self.unit_size_kv = num_heads_kv
312
+
313
+ self.global_blocks = [[] for _ in range(self.num_units)] # [[memory_unit]]
314
+ self.cached_blocks = [
315
+ {} for _ in range(self.num_units)
316
+ ] # [{block_id: block_score}]
317
+ self.num_global_block = 0
318
+
319
+ if self.faiss:
320
+ self.block_k = [
321
+ Faiss(dim_head * self.unit_size, global_k.dtype)
322
+ for _ in range(self.num_units)
323
+ ]
324
+ else:
325
+ self.block_k = [
326
+ VectorTensor(dim_head * self.unit_size, global_k.dtype)
327
+ for _ in range(self.num_units)
328
+ ]
329
+
330
+ self.local_k = torch.empty(
331
+ (self.num_units, self.unit_size_kv, 0, dim_head),
332
+ dtype=local_k.dtype,
333
+ device=local_k.device,
334
+ )
335
+ self.local_v = torch.empty(
336
+ (self.num_units, self.unit_size_kv, 0, dim_head),
337
+ dtype=local_v.dtype,
338
+ device=local_v.device,
339
+ )
340
+
341
+ if self.dense_decoding:
342
+ self.dense_k = torch.empty(
343
+ (self.num_units, self.unit_size_kv, 0, dim_head),
344
+ dtype=local_k.dtype,
345
+ device=local_k.device,
346
+ )
347
+ self.dense_v = torch.empty(
348
+ (self.num_units, self.unit_size_kv, 0, dim_head),
349
+ dtype=local_v.dtype,
350
+ device=local_v.device,
351
+ )
352
+
353
+ self.global_remainder = (
354
+ torch.empty(
355
+ (self.num_units, self.unit_size_kv, 0, dim_head),
356
+ dtype=global_k.dtype,
357
+ device=global_k.device,
358
+ ),
359
+ torch.empty(
360
+ (self.num_units, self.unit_size_kv, 0, dim_head),
361
+ dtype=global_v.dtype,
362
+ device=global_v.device,
363
+ ),
364
+ )
365
+
366
+ self.global_remainder_local_score = torch.empty(
367
+ (self.num_units, self.unit_size, 0),
368
+ dtype=global_k.dtype,
369
+ device=global_k.device,
370
+ )
371
+
372
+ self.init_k = torch.empty(
373
+ (self.num_units, self.unit_size_kv, 0, dim_head),
374
+ dtype=global_k.dtype,
375
+ device=global_k.device,
376
+ )
377
+ self.init_v = torch.empty(
378
+ (self.num_units, self.unit_size_kv, 0, dim_head),
379
+ dtype=global_k.dtype,
380
+ device=global_k.device,
381
+ )
382
+ self.init_exc = False
383
+ self.dtype = local_q.dtype
384
+ self.position_embedding._update_cos_sin_tables_len(
385
+ self.n_local + self.exc_block_size + 1, local_k.device, local_k.dim()
386
+ )
387
+
388
+ buffer_len = (
389
+ self.topk * self.block_size
390
+ + self.exc_block_size
391
+ + self.block_size
392
+ + self.n_init
393
+ )
394
+ self.global_buffer = torch.zeros(
395
+ (2, self.num_units, self.unit_size_kv, buffer_len, dim_head),
396
+ dtype=global_k.dtype,
397
+ device=global_k.device,
398
+ )
399
+ self.global_buffer_block_id_list = [
400
+ [-1] * self.topk for _ in range(self.num_units)
401
+ ]
402
+ self.global_buffer_init_st = 0
403
+ self.global_buffer_init_ed = 0
404
+ self.cuda_cache = CudaCache(
405
+ self.max_cached_block * self.num_units,
406
+ self.unit_size_kv * self.block_size * dim_head * 2,
407
+ local_k.dtype,
408
+ )
409
+
410
+ self.initialized = True
411
+
412
+ def calc_block_topk(self, global_h_q):
413
+ if not self._use_chunk_topk:
414
+ if self.num_global_block <= self.topk:
415
+ return [
416
+ list(range(len(self.global_blocks[0])))
417
+ for _ in range(self.num_units)
418
+ ]
419
+
420
+ global_h_q = global_h_q.mean(dim=2, keepdim=False)
421
+ assert global_h_q.shape == (self.num_units, self.unit_size, self.dim_head)
422
+ global_h_q = global_h_q.reshape(
423
+ self.num_units, self.dim_head * self.unit_size
424
+ )
425
+ ret = []
426
+ for u in range(self.num_units):
427
+ ret.append(self.block_k[u].get_topk(global_h_q[u], self.topk))
428
+
429
+ else:
430
+ return self._cached_topk[self._topk_cur]
431
+
432
+ return ret
433
+
434
+ def get_global_hidden_and_mask(self, len_q, block_topk):
435
+ assert len(block_topk) == self.num_units
436
+ global_block_map = [[] for _ in range(self.num_units)]
437
+ global_remainder_len = max(
438
+ self._global_remainder_ed
439
+ - self._global_remainder_st
440
+ + len_q
441
+ - self.n_local,
442
+ 0,
443
+ )
444
+ init_len = self.init_k.size(-2)
445
+ sliding_window = None
446
+
447
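+ # global buffer layout along the sequence axis:
+ # [top-k retrieved blocks | init tokens | global remainder]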
+ global_h_k = self.global_buffer[0]
448
+ global_h_v = self.global_buffer[1]
449
+
450
+ block_num = len(block_topk[0])
451
+ for u in range(self.num_units):
452
+ assert len(block_topk[u]) == block_num
453
+
454
+ block_topk[u].sort()
455
+ global_block_map[u] = deepcopy(self.global_buffer_block_id_list[u])
456
+ for b_idx in block_topk[u]:
457
+ if b_idx in global_block_map[u]:
458
+ continue
459
+
460
+ st = -1
461
+ ed = -1
462
+ for j in range(self.topk):
463
+ if (
464
+ global_block_map[u][j] == -1
465
+ or global_block_map[u][j] not in block_topk[u]
466
+ ):
467
+ st = j * self.block_size
468
+ ed = st + self.block_size
469
+ global_block_map[u][j] = b_idx
470
+ break
471
+
472
+ assert b_idx in self.cached_blocks[u]
473
+ self.global_blocks[u][b_idx].load(
474
+ (global_h_k[u, :, st:ed, :], global_h_v[u, :, st:ed, :])
475
+ )
476
+
477
+ init_st = block_num * self.block_size
478
+ init_ed = init_st + init_len
479
+ if (
480
+ self.global_buffer_init_st != init_st
481
+ or self.global_buffer_init_ed != init_ed
482
+ ):
483
+ global_h_k[:, :, init_st:init_ed, :].copy_(self.init_k, non_blocking=True)
484
+ global_h_v[:, :, init_st:init_ed, :].copy_(self.init_v, non_blocking=True)
485
+
486
+ ed = init_ed
487
+
488
+ rmd_st = init_ed
489
+ rmd_ed = rmd_st + global_remainder_len
490
+ ed = rmd_ed
491
+ global_h_k[:, :, rmd_st:rmd_ed, :].copy_(
492
+ self.global_remainder[0][
493
+ :,
494
+ :,
495
+ self._global_remainder_st : self._global_remainder_st
496
+ + global_remainder_len,
497
+ :,
498
+ ],
499
+ non_blocking=True,
500
+ )
501
+ global_h_v[:, :, rmd_st:rmd_ed, :].copy_(
502
+ self.global_remainder[1][
503
+ :,
504
+ :,
505
+ self._global_remainder_st : self._global_remainder_st
506
+ + global_remainder_len,
507
+ :,
508
+ ],
509
+ non_blocking=True,
510
+ )
511
+
512
+ sliding_window = (self.global_remainder[0].size(-2) + rmd_st, self.n_local)
513
+
514
+ self.global_buffer_block_id_list = deepcopy(global_block_map)
515
+ self.global_buffer_init_st = init_st
516
+ self.global_buffer_init_ed = init_ed
517
+
518
+ for u in range(self.num_units):
519
+ assert max(global_block_map[u][block_num:] + [-1]) == -1
520
+ assert min(global_block_map[u][:block_num] + [0]) > -1
521
+ global_block_map[u] = list(global_block_map[u][:block_num])
522
+
523
+ global_h_k = global_h_k[:, :, :ed, :]
524
+ global_h_v = global_h_v[:, :, :ed, :]
525
+ return global_h_k, global_h_v, sliding_window, global_block_map, block_num
526
+
527
+ def update_block_score(
528
+ self, global_score: torch.FloatTensor, global_block_map, global_block_num
529
+ ):
530
+ if global_score is not None:
531
+ global_score = global_score[:, :, : global_block_num * self.block_size]
532
+ assert global_score.shape == (
533
+ self.num_units,
534
+ self.unit_size,
535
+ global_block_num * self.block_size,
536
+ )
537
+ global_score = global_score.view(
538
+ self.num_units, self.unit_size, global_block_num, self.block_size
539
+ )
540
+ global_score = global_score.sum(dim=-1).sum(dim=1)
541
+ assert global_score.shape == (self.num_units, global_block_num)
542
+ global_score = global_score.to(
543
+ device="cpu", non_blocking=False
544
+ ) # (num_units, global_block_num)
545
+ for u in range(self.num_units):
546
+ for k, v in self.cached_blocks[u].items():
547
+ self.cached_blocks[u][k] = v * self.score_decay
548
+ score = global_score[u].tolist()
549
+ assert len(score) >= len(global_block_map[u])
550
+ for s, i in zip(score, global_block_map[u]):
551
+ self.cached_blocks[u][i] += s
552
+
553
+ def _append(self, local_q, local_k, local_v, global_q):
554
+ # get local_h_q, local_h_k, local_h_v
555
+ local_h_q, local_h_k = self.position_embedding(local_q, local_k)
556
+ local_h_v = local_v
557
+
558
+ # calc local result first to overlap host-device communication
559
+ attn = self.Attn(local_h_q.shape, local_h_q.dtype, local_h_q.device)
560
+ attn.append(
561
+ local_h_q, local_h_k, local_h_v, get_score=True, sliding_window=self.n_local
562
+ )
563
+
564
+ # calc topk global repr k and load cache
565
+ with torch.cuda.stream(GLOBAL_STREAM):
566
+ block_topk = self.calc_block_topk(global_q)
567
+
568
+ for u in range(self.num_units):
569
+ num_remove = len(self.cached_blocks[u]) - self.max_cached_block
570
+ for bidx in block_topk[u]:
571
+ if bidx not in self.cached_blocks[u]:
572
+ num_remove += 1
573
+
574
+ # update cache
575
+ self.remove_lru_blocks(u, num_remove, block_topk[u])
576
+
577
+ if self.cache_strategy == "lru":
578
+ self.load_count += 1
579
+ for u in range(self.num_units):
580
+ for bidx in block_topk[u]:
581
+ self.cached_blocks[u][bidx] = self.load_count
582
+
583
+ elif self.cache_strategy == "lru-s":
584
+ for u in range(self.num_units):
585
+ for bidx in block_topk[u]:
586
+ self.cached_blocks[u][bidx] = 0
587
+ else:
588
+ raise ValueError(f"unknown cache_strategy: {self.cache_strategy}")
589
+
590
+ # get global_h_k, global_h_v, global_mask
591
+ # Because exc_block_size <= n_local, no global_k/global_v are used in the global part
592
+ global_h_q = global_q
593
+ (
594
+ global_h_k,
595
+ global_h_v,
596
+ global_sliding_window,
597
+ global_block_map,
598
+ global_block_num,
599
+ ) = self.get_global_hidden_and_mask(local_h_q.size(-2), block_topk)
600
+
601
+ if self.async_global_stream:
602
+ torch.cuda.current_stream().wait_stream(GLOBAL_STREAM)
603
+
604
+ # calc global result
605
+ attn.append(
606
+ global_h_q,
607
+ global_h_k,
608
+ global_h_v,
609
+ end=True,
610
+ get_score=self.calc_block_score,
611
+ sliding_window=global_sliding_window,
612
+ complement_sliding_window=True,
613
+ )
614
+
615
+ o, score_list = attn.get_result()
616
+ loc_score = score_list[0]
617
+ glb_score = score_list[1]
618
+
619
+ if self.async_global_stream:
620
+ GLOBAL_STREAM.wait_stream(torch.cuda.current_stream())
621
+
622
+ # update global score
623
+ with torch.cuda.stream(GLOBAL_STREAM):
624
+ self.update_block_score(glb_score, global_block_map, global_block_num)
625
+
626
+ return o.view((self.batch_size, self.num_heads, -1, self.dim_head)), loc_score
627
+
628
+ def get_batched_topk(self, global_q):
629
+ length = global_q.shape[2]
630
+ exc_num = (length + self.exc_block_size - 1) // self.exc_block_size
631
+ exc_block_num = length // self.exc_block_size
632
+ ret = []
633
+ if self.num_global_block <= self.topk:
634
+ for _ in range(exc_num):
635
+ ret.append(
636
+ [
637
+ list(range(len(self.global_blocks[0])))
638
+ for _ in range(self.num_units)
639
+ ]
640
+ )
641
+ return ret
642
+
643
+ global_h_q = global_q
644
+ assert global_h_q.dim() == 4
645
+ assert global_h_q.shape[:2] == (self.num_units, self.unit_size)
646
+ assert global_h_q.shape[3] == self.dim_head
647
+
648
+ block_k = torch.cat(
649
+ [self.block_k[u].get_data()[None, :, :] for u in range(self.num_units)],
650
+ dim=0,
651
+ )
652
+ assert block_k.shape == (
653
+ self.num_units,
654
+ self.num_global_block,
655
+ self.dim_head * self.unit_size,
656
+ )
657
+ block_k = (
658
+ block_k.reshape(
659
+ self.num_units, self.num_global_block, self.unit_size, self.dim_head
660
+ )
661
+ .permute(0, 2, 1, 3)
662
+ .contiguous()
663
+ )
664
+
665
+ if exc_block_num > 0:
666
+ tmp_global_h_q = (
667
+ global_h_q[:, :, : exc_block_num * self.exc_block_size, :]
668
+ .reshape(
669
+ self.num_units,
670
+ self.unit_size,
671
+ exc_block_num,
672
+ self.exc_block_size,
673
+ self.dim_head,
674
+ )
675
+ .mean(dim=-2)
676
+ )
677
+ assert tmp_global_h_q.shape == (
678
+ self.num_units,
679
+ self.unit_size,
680
+ exc_block_num,
681
+ self.dim_head,
682
+ )
683
+ block_score = torch.matmul(tmp_global_h_q, block_k.transpose(-1, -2)).mean(
684
+ dim=1
685
+ ) # (num_units, exc_block_num, num_global_block)
686
+ assert block_score.shape == (
687
+ self.num_units,
688
+ exc_block_num,
689
+ self.num_global_block,
690
+ )
691
+
692
+ indices = block_score.topk(self.topk, dim=-1).indices.cpu()
693
+ for b in range(exc_block_num):
694
+ tmp = []
695
+ for u in range(self.num_units):
696
+ tmp.append(indices[u, b].tolist())
697
+ assert len(tmp[-1]) == self.topk
698
+
699
+ ret.append(tmp)
700
+
701
+ if exc_block_num != exc_num:
702
+ tmp_global_h_q = (
703
+ global_h_q[:, :, exc_block_num * self.exc_block_size :, :]
704
+ .reshape(
705
+ self.num_units,
706
+ self.unit_size,
707
+ length - exc_block_num * self.exc_block_size,
708
+ self.dim_head,
709
+ )
710
+ .mean(dim=-2, keepdim=True)
711
+ )
712
+ assert tmp_global_h_q.shape == (
713
+ self.num_units,
714
+ self.unit_size,
715
+ 1,
716
+ self.dim_head,
717
+ )
718
+ block_score = torch.matmul(tmp_global_h_q, block_k.transpose(-1, -2))
719
+ assert block_score.shape == (
720
+ self.num_units,
721
+ self.unit_size,
722
+ 1,
723
+ self.num_global_block,
724
+ )
725
+ block_score = block_score.squeeze(dim=2).mean(dim=1)
726
+ assert block_score.shape == (self.num_units, self.num_global_block)
727
+ indices = block_score.topk(self.topk, dim=-1).indices.cpu()
728
+ tmp = []
729
+ for u in range(self.num_units):
730
+ tmp.append(indices[u].tolist())
731
+ assert len(tmp[-1]) == self.topk
732
+
733
+ ret.append(tmp)
734
+
735
+ return ret
736
+
737
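+ # Tokens that have fallen out of the local window are first routed into the
+ # init ("sink") cache until it holds n_init tokens; afterwards they are
+ # packed, block_size at a time, into offloadable MemoryUnits whose repr_topk
+ # representative keys are registered for later retrieval.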
+ def append_global(self, exc_length, kv_length, local_score):
738
+ global_remainder_ed = self._global_remainder_ed + exc_length
739
+ global_remainder_st = self._global_remainder_st
740
+
741
+ global_remainder_len = global_remainder_ed - global_remainder_st
742
+
743
+ assert local_score.shape[:3] == (self.num_units, self.unit_size, kv_length)
744
+ local_score = local_score[:, :, -exc_length - self.n_local :]
745
+ self.global_remainder_local_score[
746
+ :, :, global_remainder_ed - local_score.size(-1) : global_remainder_ed
747
+ ].add_(local_score)
748
+
749
+ if not self.init_exc and global_remainder_len > self.n_local:
750
+ global_k = self.global_remainder[0]
751
+ global_v = self.global_remainder[1]
752
+
753
+ append_init_len = min(
754
+ self.n_init - self.init_k.size(-2), global_remainder_len - self.n_local
755
+ )
756
+ self.init_k = torch.cat(
757
+ (
758
+ self.init_k,
759
+ global_k[
760
+ :,
761
+ :,
762
+ global_remainder_st : global_remainder_st + append_init_len,
763
+ :,
764
+ ],
765
+ ),
766
+ dim=-2,
767
+ )
768
+ self.init_v = torch.cat(
769
+ (
770
+ self.init_v,
771
+ global_v[
772
+ :,
773
+ :,
774
+ global_remainder_st : global_remainder_st + append_init_len,
775
+ :,
776
+ ],
777
+ ),
778
+ dim=-2,
779
+ )
780
+ global_remainder_st += append_init_len
781
+ global_remainder_len -= append_init_len
782
+
783
+ if self.init_k.size(-2) == self.n_init:
784
+ self.init_exc = True
785
+
786
+ while global_remainder_len - self.block_size >= self.n_local:
787
+ global_remainder_len -= self.block_size
788
+ for u in range(self.num_units):
789
+ self.global_blocks[u].append(
790
+ (
791
+ MemoryUnit(
792
+ (
793
+ self.global_remainder[0][
794
+ u,
795
+ :,
796
+ global_remainder_st : global_remainder_st
797
+ + self.block_size,
798
+ :,
799
+ ],
800
+ self.global_remainder[1][
801
+ u,
802
+ :,
803
+ global_remainder_st : global_remainder_st
804
+ + self.block_size,
805
+ :,
806
+ ],
807
+ ),
808
+ self.cuda_cache,
809
+ False,
810
+ self.pin_memory,
811
+ )
812
+ )
813
+ )
814
+
815
+ global_block_k = self.get_block_k(
816
+ self.global_remainder[0][
817
+ :, :, global_remainder_st : global_remainder_st + self.block_size, :
818
+ ],
819
+ self.global_remainder_local_score[
820
+ :, :, global_remainder_st : global_remainder_st + self.block_size
821
+ ],
822
+ )
823
+ assert global_block_k.shape == (
824
+ self.num_units,
825
+ self.unit_size,
826
+ self.repr_topk,
827
+ self.dim_head,
828
+ )
829
+ global_block_k = global_block_k.mean(dim=-2, keepdim=False)
830
+ global_block_k = global_block_k.reshape(
831
+ self.num_units, self.unit_size * self.dim_head
832
+ )
833
+ global_block_k = global_block_k[:, None, :]
834
+
835
+ self.num_global_block += 1
836
+ for u in range(self.num_units):
837
+ self.block_k[u].append(global_block_k[u])
838
+ global_remainder_st += self.block_size
839
+
840
+ self._global_remainder_ed = global_remainder_ed
841
+ self._global_remainder_st = global_remainder_st
842
+
843
+ def append(
844
+ self,
845
+ local_q,
846
+ local_k,
847
+ local_v,
848
+ global_q,
849
+ global_k,
850
+ global_v,
851
+ ):
852
+ batch_size = local_q.size(0)
853
+ input_length = local_q.size(-2)
854
+
855
+ if self.perhead:
856
+ num_heads = local_q.size(1)
857
+ num_heads_kv = local_v.size(1)
858
+
859
+ def repeat_kv(t):
860
+ t = t.view(batch_size, num_heads_kv, 1, input_length, -1)
861
+ t = t.expand(
862
+ batch_size,
863
+ num_heads_kv,
864
+ num_heads // num_heads_kv,
865
+ input_length,
866
+ -1,
867
+ )
868
+ t = t.reshape(batch_size * num_heads, 1, input_length, -1)
869
+ return t
870
+
871
+ local_q = local_q.view(batch_size * num_heads, 1, input_length, -1)
872
+ local_k = repeat_kv(local_k)
873
+ local_v = repeat_kv(local_v)
874
+ global_q = global_q.view(batch_size * num_heads, 1, input_length, -1)
875
+ global_k = repeat_kv(global_k)
876
+ global_v = repeat_kv(global_v)
877
+
878
+ if not self.initialized:
879
+ self.init(local_q, local_k, local_v, global_q, global_k, global_v)
880
+
881
+ input_length = local_q.size(-2)
882
+
883
+ if self.async_global_stream:
884
+ GLOBAL_STREAM.wait_stream(torch.cuda.current_stream())
885
+
886
+ # append local and global tensor
887
+ self.local_k = torch.cat((self.local_k, local_k), dim=-2)
888
+ self.local_v = torch.cat((self.local_v, local_v), dim=-2)
889
+ kv_length = self.local_k.size(-2)
890
+
891
+ if self.dense_decoding:
892
+ self.dense_k = torch.cat((self.dense_k, local_k), dim=-2)
893
+ self.dense_v = torch.cat((self.dense_v, local_v), dim=-2)
894
+
895
+ # append global remainder
896
+ with torch.cuda.stream(GLOBAL_STREAM):
897
+ self._global_remainder_st = 0
898
+ self._global_remainder_ed = self.global_remainder[0].size(-2)
899
+
900
+ self.global_remainder = (
901
+ torch.cat((self.global_remainder[0], global_k), dim=-2),
902
+ torch.cat((self.global_remainder[1], global_v), dim=-2),
903
+ )
904
+
905
+ self.global_remainder_local_score = torch.cat(
906
+ (
907
+ self.global_remainder_local_score,
908
+ torch.zeros(
909
+ (self.num_units, self.unit_size, global_k.size(-2)),
910
+ dtype=global_k.dtype,
911
+ device=global_k.device,
912
+ ),
913
+ ),
914
+ dim=-1,
915
+ )
916
+
917
+ with torch.cuda.stream(GLOBAL_STREAM):
918
+ global_q = self.position_embedding.apply_rotary_pos_emb_one_angle(
919
+ global_q, self.n_local
920
+ )
921
+
922
+ use_chunk_topk = self.chunk_topk_calc is not None and input_length > 1
923
+ self._use_chunk_topk = use_chunk_topk
924
+ if use_chunk_topk:
925
+ exc_block_num = input_length // self.exc_block_size
926
+ exc_block_per_topk_chunk = self.chunk_topk_calc // self.exc_block_size
927
+ calc_cur_list = [
928
+ i * self.exc_block_size
929
+ for i in range(0, exc_block_num + 1, exc_block_per_topk_chunk)
930
+ ]
931
+ if calc_cur_list[-1] < input_length:
932
+ calc_cur_list.append(input_length)
933
+ self._topk_cur = 0
934
+ self._topk_calc_cur = -1
935
+
936
+ o_list = []
937
+
938
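+ # prefill in chunks of exc_block_size queries; each chunk retrieves its own top-k blocks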
+ for st in range(0, input_length, self.exc_block_size):
939
+ ed = min(st + self.exc_block_size, input_length)
940
+ if use_chunk_topk and calc_cur_list[self._topk_calc_cur + 1] < ed:
941
+ # calculate topk and sync with host here
942
+ assert ed <= calc_cur_list[self._topk_calc_cur + 2]
943
+ self._topk_calc_cur += 1
944
+ with torch.cuda.stream(GLOBAL_STREAM):
945
+ self._cached_topk = self.get_batched_topk(
946
+ global_q[
947
+ :,
948
+ :,
949
+ calc_cur_list[self._topk_calc_cur] : calc_cur_list[
950
+ self._topk_calc_cur + 1
951
+ ],
952
+ :,
953
+ ]
954
+ )
955
+ self._topk_cur = 0
956
+
957
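+ # keys/values visible to this chunk: the chunk itself plus at most n_local preceding tokens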
+ kv_st = max(kv_length + st - input_length - self.n_local, 0)
958
+ kv_ed = kv_length + ed - input_length
959
+ chunk_o, local_score = self._append(
960
+ local_q[:, :, st:ed, :],
961
+ self.local_k[:, :, kv_st:kv_ed, :],
962
+ self.local_v[:, :, kv_st:kv_ed, :],
963
+ global_q[:, :, st:ed, :],
964
+ )
965
+ o_list.append(chunk_o)
966
+
967
+ # append global
968
+ with torch.cuda.stream(GLOBAL_STREAM):
969
+ self.append_global(ed - st, kv_ed - kv_st, local_score)
970
+
971
+ if self.async_global_stream:
972
+ torch.cuda.current_stream().wait_stream(GLOBAL_STREAM)
973
+
974
+ if use_chunk_topk:
975
+ self._topk_cur += 1
976
+
977
+ self.length += input_length
978
+
979
+ # update local and global tensor
980
+ if self.local_k.size(-2) >= self.n_local:
981
+ self.local_k = self.local_k[:, :, -self.n_local :, :]
982
+ self.local_v = self.local_v[:, :, -self.n_local :, :]
983
+
984
+ assert self._global_remainder_ed == self.global_remainder[0].size(-2)
985
+ with torch.cuda.stream(GLOBAL_STREAM):
986
+ self.global_remainder = (
987
+ self.global_remainder[0][:, :, self._global_remainder_st :, :],
988
+ self.global_remainder[1][:, :, self._global_remainder_st :, :],
989
+ )
990
+ self.global_remainder_local_score = self.global_remainder_local_score[
991
+ :, :, self._global_remainder_st :
992
+ ]
993
+
994
+ ret = torch.cat(o_list, dim=-2)
995
+
996
+ if self.perhead:
997
+ ret = ret.view(batch_size, num_heads, input_length, -1)
998
+
999
+ return ret
1000
+
1001
+ def size(self, *args, **kwargs):
1002
+ return self.length
1003
+
1004
+
1005
+ def inf_llm_forward(
1006
+ n_local,
1007
+ n_init,
1008
+ topk,
1009
+ block_size,
1010
+ max_cached_block,
1011
+ exc_block_size,
1012
+ repr_topk: int = 1,
1013
+ cache_strategy="lru",
1014
+ score_decay=None,
1015
+ chunk_topk_calc=None,
1016
+ async_global_stream=True,
1017
+ pin_memory=False,
1018
+ faiss=False,
1019
+ perhead=False,
1020
+ dense_decoding=False,
1021
+ *args,
1022
+ **kwargs
1023
+ ):
1024
+ def forward(
1025
+ self,
1026
+ query: torch.Tensor,
1027
+ key_value: torch.Tensor,
1028
+ position_bias: Optional[torch.Tensor],
1029
+ use_cache: bool,
1030
+ past_key_value,
1031
+ project_q,
1032
+ project_k,
1033
+ project_v,
1034
+ attention_out,
1035
+ dim_head,
1036
+ num_heads,
1037
+ num_heads_kv,
1038
+ ):
1039
+ batch_size = query.size(0)
1040
+ len_q = query.size(1)
1041
+ len_k = key_value.size(1)
1042
+
1043
+ # assert use_cache
1044
+
1045
+ h_q = project_q(query) # (batch, len_q, num_heads * dim_head)
1046
+ h_k = project_k(key_value) # (batch, len_k, num_heads * dim_head)
1047
+ h_v = project_v(key_value) # (batch, len_k, num_heads * dim_head)
1048
+
1049
+ h_q = (
1050
+ h_q.view(batch_size, len_q, num_heads, dim_head)
1051
+ .permute(0, 2, 1, 3)
1052
+ .contiguous()
1053
+ ) # (batch, num_heads, len_q, dim_head)
1054
+ h_k = (
1055
+ h_k.view(batch_size, len_k, num_heads_kv, dim_head)
1056
+ .permute(0, 2, 1, 3)
1057
+ .contiguous()
1058
+ ) # (batch, num_heads_kv, len_k, dim_head)
1059
+ h_v = (
1060
+ h_v.view(batch_size, len_k, num_heads_kv, dim_head)
1061
+ .permute(0, 2, 1, 3)
1062
+ .contiguous()
1063
+ ) # (batch, num_heads_kv, len_k, dim_head)
1064
+
1065
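+ # decoding path: fall back to ordinary dense attention over the full cached KV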
+ if len_q == 1 and dense_decoding:
1066
+ past_k = past_key_value.dense_k
1067
+ past_v = past_key_value.dense_v
1068
+
1069
+ h_k = torch.cat((past_k, h_k), dim=-2)
1070
+ h_v = torch.cat((past_v, h_v), dim=-2)
1071
+
1072
+ past_key_value.dense_k = h_k
1073
+ past_key_value.dense_v = h_v
1074
+
1075
+ h_q, h_k = position_bias(h_q, h_k)
1076
+
1077
+ # (batch_size, seqlen, nheads, headdim)
1078
+ h_q = h_q.transpose(1, 2)
1079
+ h_k = h_k.transpose(1, 2)
1080
+ h_v = h_v.transpose(1, 2)
1081
+
1082
+ # (batch_size, seqlen, nheads, headdim)
1083
+ o = flash_attn_func(h_q, h_k, h_v, causal=True)
1084
+
1085
+ o = o.reshape(batch_size, len_q, dim_head * num_heads)
1086
+ o = attention_out(o)
1087
+
1088
+ if use_cache:
1089
+ return o, past_key_value
1090
+ else:
1091
+ return o
1092
+
1093
+ if past_key_value is None:
1094
+ past_key_value = ContextManager(
1095
+ position_bias,
1096
+ n_init,
1097
+ n_local,
1098
+ block_size,
1099
+ max_cached_block,
1100
+ topk,
1101
+ exc_block_size,
1102
+ score_decay,
1103
+ repr_topk,
1104
+ cache_strategy,
1105
+ chunk_topk_calc,
1106
+ async_global_stream,
1107
+ pin_memory,
1108
+ faiss,
1109
+ perhead,
1110
+ dense_decoding=dense_decoding,
1111
+ )
1112
+
1113
+ local_q, local_k, local_v = h_q, h_k, h_v
1114
+ global_q, global_k, global_v = h_q, h_k, h_v
1115
+
1116
+ o = past_key_value.append(
1117
+ local_q,
1118
+ local_k,
1119
+ local_v,
1120
+ global_q,
1121
+ global_k,
1122
+ global_v,
1123
+ )
1124
+
1125
+ o = o.view(batch_size, num_heads, len_q, dim_head).permute(0, 2, 1, 3)
1126
+ o = o.reshape(batch_size, len_q, dim_head * num_heads)
1127
+ o = attention_out(o)
1128
+
1129
+ if use_cache:
1130
+ return o, past_key_value
1131
+ else:
1132
+ return o
1133
+
1134
+ return forward
1135
+
1136
+
1137
+ class GreedySearch:
1138
+ def __init__(self, model, tokenizer):
1139
+ model.eval()
1140
+ self.device = model.device
1141
+ self.model = model
1142
+ self.tokenizer = tokenizer
1143
+ self.past_kv = None
1144
+
1145
+ def clear(self):
1146
+ self.past_kv = None
1147
+
1148
+ def _process_texts(self, input_text):
1149
+ model_inputs = {}
1150
+ input_ids = self.tokenizer.encode(input_text)
1151
+
1152
+ model_inputs["input_ids"] = input_ids
1153
+ model_inputs["attention_mask"] = [1] * len(model_inputs["input_ids"])
1154
+
1155
+ for key in model_inputs:
1156
+ model_inputs[key] = (
1157
+ torch.tensor(model_inputs[key]).int().unsqueeze(0).cuda()
1158
+ )
1159
+
1160
+ return model_inputs
1161
+
1162
+ def generate(self, text=None, input_ids=None, **kwargs):
1163
+ if input_ids is None:
1164
+ model_inputs = self._process_texts(text)
1165
+ input_ids = model_inputs["input_ids"]
1166
+
1167
+ with torch.inference_mode():
1168
+ result = self._decode(input_ids, **kwargs)
1169
+
1170
+ self.clear()
1171
+ return result
1172
+
1173
+ def _decode(
1174
+ self,
1175
+ input_ids,
1176
+ max_length=100,
1177
+ extra_end_token_ids=None,
1178
+ chunk_size: int = 4096,
1179
+ output=False,
1180
+ ):
1181
+ if input_ids.dim() == 1:
1182
+ input_ids = input_ids[None, :]
1183
+ input_ids = input_ids.cuda()
1184
+ attention_mask = torch.ones_like(input_ids)
1185
+ assert input_ids.size(0) == 1
1186
+ length = input_ids.size(1)
1187
+ end_token_ids = list(extra_end_token_ids or []) + [self.tokenizer.eos_token_id]
1188
+ logits = None
1189
+ past_key_values = self.past_kv
1190
+ if output:
1191
+ output_text = ""
1192
+
1193
+ for i in range(max_length + 1):
1194
+ if i == 0:
1195
+ if chunk_size is None:
1196
+ chunk_size = input_ids.size(1)
1197
+ for st in range(0, input_ids.size(1) - 1, chunk_size):
1198
+ ed = min(input_ids.size(1) - 1, st + chunk_size)
1199
+ out = self.model(
1200
+ input_ids=input_ids[:, st:ed],
1201
+ attention_mask=attention_mask[:, :ed],
1202
+ use_cache=True,
1203
+ return_dict=True,
1204
+ past_key_values=past_key_values,
1205
+ )
1206
+ logits, past_key_values = out.logits, out.past_key_values
1207
+
1208
+ out = self.model(
1209
+ input_ids=input_ids[:, -1:],
1210
+ attention_mask=attention_mask,
1211
+ use_cache=True,
1212
+ return_dict=True,
1213
+ past_key_values=past_key_values,
1214
+ )
1215
+ logits, past_key_values = out.logits, out.past_key_values
1216
+ else:
1217
+ out = self.model(
1218
+ input_ids=input_ids[:, -1:],
1219
+ attention_mask=attention_mask,
1220
+ past_key_values=past_key_values,
1221
+ use_cache=True,
1222
+ return_dict=True,
1223
+ )
1224
+ logits, past_key_values = out.logits, out.past_key_values
1225
+
1226
+ logits = logits[:, -1, :]
1227
+ word = logits.argmax(dim=-1)
1228
+ if word.item() in end_token_ids or i == max_length:
1229
+ break
1230
+
1231
+ input_ids = torch.cat((input_ids, word.view(1, 1)), dim=-1)
1232
+ attention_mask = torch.cat(
1233
+ (
1234
+ attention_mask,
1235
+ torch.ones(
1236
+ (attention_mask.size(0), 1),
1237
+ dtype=torch.int,
1238
+ device=attention_mask.device,
1239
+ ),
1240
+ ),
1241
+ dim=-1,
1242
+ )
1243
+ if output:
1244
+ tmp = self.tokenizer.decode(input_ids.squeeze(0)[length:])
1245
+ if len(tmp) > len(output_text):
1246
+ sys.stdout.write(tmp[len(output_text) :])
1249
+ sys.stdout.flush()
1250
+ output_text = tmp
1251
+
1252
+ self.past_kv = past_key_values
1253
+
1254
+ if output:
1255
+ sys.stdout.write("\n")
1256
+ sys.stdout.flush()
1257
+
1258
+ # return [self.tokenizer.decode(input_ids.squeeze(0)[length:])]
1259
+ return input_ids
1260
+
1261
+
1262
+ class InfLLMGenerator(GreedySearch):
1263
+ def generate(
1264
+ self,
1265
+ input_ids=None,
1266
+ generation_config=None,
1267
+ pad_token_id=None,
1268
+ max_new_tokens=None,
1269
+ ):
1270
+ if max_new_tokens is None:
1271
+ max_new_tokens = generation_config.max_new_tokens
1274
+ return super().generate(
1275
+ text=None,
1276
+ input_ids=input_ids,
1277
+ max_length=max_new_tokens,
1278
+ chunk_size=8192,
1279
+ extra_end_token_ids=[pad_token_id] if pad_token_id is not None else [],
1280
+ )
1281
+
1282
+ @torch.no_grad()
1283
+ def __call__(self, input_ids=None, *args, **kwargs):
1284
+ # chunked forward
1285
+ chunk_size = 8192
1286
+ all_logits = torch.empty(0, dtype=torch.bfloat16).to(input_ids.device)
1287
+ for st in range(0, input_ids.size(1), chunk_size):
1288
+ torch.cuda.empty_cache()
1289
+ ed = min(input_ids.size(1), st + chunk_size)
1290
+ out = self.model(
1291
+ input_ids=input_ids[:, st:ed],
1292
+ )
1293
+ logits = out.logits.to(torch.bfloat16)
1294
+ all_logits = torch.cat((all_logits, logits), dim=1)
1295
+
1296
+ return CausalLMOutput(logits=all_logits)
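
To make the generation wrappers above concrete, a minimal greedy-decoding sketch (illustrative only: the checkpoint name is a placeholder, and the model's attention is assumed to have already been patched with `inf_llm_forward`):

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("my-org/patched-llama")  # hypothetical checkpoint
    model = AutoModelForCausalLM.from_pretrained(
        "my-org/patched-llama", torch_dtype=torch.bfloat16, device_map="cuda"
    )
    searcher = GreedySearch(model, tokenizer)
    out_ids = searcher.generate(text="Summarize the report below ...", max_length=128)
    print(tokenizer.decode(out_ids.squeeze(0)))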
minference/modules/minference_forward.py ADDED
@@ -0,0 +1,855 @@
1
+ import inspect
2
+ import json
3
+ import os
4
+ from importlib import import_module
5
+
6
+ from transformers.models.llama.modeling_llama import *
7
+ from vllm.attention.backends.flash_attn import *
8
+
9
+ from ..ops.block_sparse_flash_attention import block_sparse_attention
10
+ from ..ops.pit_sparse_flash_attention_v2 import vertical_slash_sparse_attention
11
+ from ..ops.streaming_kernel import streaming_forward, streaming_forward2
12
+ from .snap_kv import *
13
+
14
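+ # LAST_Q_MASK: lower-triangular (causal) mask over the trailing last_q query
+ # positions; True where the key index does not exceed the query index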
+ last_q = 64
15
+ arange = torch.arange(last_q, device="cuda")
16
+ LAST_Q_MASK = arange[None, None, :, None] >= arange[None, None, None, :]
17
+ ROPE_TYPE = None
18
+ SEARCH_MASK = None
19
+
20
+ def init_minference_parameters(self):
21
+ config = self.config.to_dict()
22
+ self.starting_layer = config.get("starting_layer", 0)
23
+ self.is_search = config.get("is_search", False)
24
+
25
+ # self.n_init = config.get("n_init", 128)
26
+ # self.n_local = config.get("n_local", 3968)
27
+
28
+ self.ne_inf = None
29
+ self.config_path = config.get("config_path", "")
30
+ if os.path.exists(self.config_path) and self.layer_idx < len(json.load(open(self.config_path))):
31
+ self.best_pattern = {int(ii): jj for ii, jj in json.load(open(self.config_path))[self.layer_idx].items()}
32
+ else:
33
+ self.best_pattern = {}
34
+ self.vertical, self.slash = None, None
35
+
36
+ # import apply_rotary_pos_emb
37
+ if "apply_rotary_pos_emb" not in self.__dict__:
38
+ global apply_rotary_pos_emb
39
+ model_path = self.rotary_emb.__class__.__module__
40
+ apply_rotary_pos_emb = getattr(import_module(model_path), "apply_rotary_pos_emb")
41
+ self.apply_rotary_pos_emb = True
42
+
43
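+ # Sums every diagonal of the (b, h, n, m) score matrix via a strided view, e.g.
+ # [[1, 2, 3],
+ #  [4, 5, 6]] -> [4, 1+5, 2+6, 3] = [4, 6, 8, 3]
+ # (one entry per diagonal, ordered from bottom-left to top-right)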
+ def sum_all_diagonal_matrix(mat: torch.Tensor):
44
+ b, h, n, m = mat.shape
45
+ zero_mat = torch.zeros((b, h, n, n)).to(mat.device) # Zero matrix used for padding
46
+ mat_padded = torch.cat((zero_mat, mat, zero_mat), -1) # pads the matrix on left and right
47
+ mat_strided = mat_padded.as_strided((1, 1, n, n + m), (1, n * (2 * n + m), 2 * n + m + 1, 1)) # Change the strides
48
+ sum_diags = torch.sum(mat_strided, 2) # Sums the resulting matrix's columns
49
+ return sum_diags[:,:,1:]
50
+
51
+ def gather(t, dim, i):
52
+ """A broadcasting version of torch.gather."""
53
+ dim += (dim < 0) * t.ndim
54
+ return t.gather(dim, i.expand(*t.shape[:dim], i.shape[dim], *t.shape[dim + 1 :]))
55
+
56
+ def gather_qkv(q, k, v, attention_mask):
57
+ attn_weights = torch.matmul(q, k.transpose(2, 3)) / math.sqrt(q.size(-1)) + attention_mask
58
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(q.dtype)
59
+ attn_output = torch.matmul(attn_weights, v)
60
+ return attn_output
61
+
62
+ def search_pattern(q, k, head):
63
+ q_len = q.shape[2]
64
+ head_dim = q.shape[-1]
65
+
66
+ def vertical_and_slash(vertical_size, slash_size):
67
+ last_q = 64
68
+ q_len = q.shape[2]
69
+ qk_idxs = [ii + q_len for ii in list(range(-last_q, 0, 1))]
70
+ qk = torch.matmul(q[:,:,qk_idxs,:], k.transpose(2, 3))/ math.sqrt(head_dim) + attention_mask[:,:,qk_idxs]
71
+ qk = torch.nn.functional.softmax(qk, dim=-1, dtype=torch.float32)
72
+ vertical = qk.sum(-2, keepdim=True)
73
+ vertical[...,:30] = 10000
74
+ vertical_topk = torch.topk(-vertical, q_len - vertical_size, -1).indices
75
+
76
+ slash = sum_all_diagonal_matrix(qk)[...,:-last_q + 1]
77
+ slash[...,-30:] = 10000
78
+ slash_topk = slash
79
+ slash = torch.topk(slash, slash_size, -1).indices - (q_len - 1)
80
+ slash = torch.stack([torch.sparse.spdiags(torch.ones(slash_size, q_len), slash.cpu()[0][_], (q_len, q_len)).to_dense() for _ in range(1)]).to(q.device)
81
+
82
+ est_attn = torch.ones_like(attn_weights)
83
+ dim = 3
84
+ est_attn = est_attn.scatter(3, vertical_topk.expand(*est_attn.shape[:dim], vertical_topk.shape[dim], *est_attn.shape[dim + 1 :]), 0)
85
+ est_attn = est_attn + slash
86
+
87
+ est_attn = (est_attn > 0).float()
88
+ est_attn = torch.tril(est_attn)
89
+ attn_weights_x = attn_weights * est_attn
90
+ res3 = attn_weights_x[:,:,2500:].sum(-1).mean(-1).squeeze().float().detach().cpu().numpy()
91
+ return res3
92
+
93
+ def stream_llm(vertical_size, slash_size):
94
+ q_len = q.shape[2]
95
+
96
+ mask = torch.triu(torch.tril(torch.ones(q_len, q_len), 0), -slash_size).to(q)
97
+ mask[:,:vertical_size] = 1
98
+ mask = mask.unsqueeze(0).unsqueeze(1)
99
+
100
+ est_attn = torch.tril(mask)
101
+ attn_weights_x = attn_weights * est_attn
102
+ res3 = attn_weights_x[:,:,2500:].sum(-1).mean(-1).squeeze().float().detach().cpu().numpy()
103
+ return res3
104
+
105
+ def block_sparse(topk_ratio, slash_size=None):
106
+ block_num = (q_len -1) // 32 + 1
107
+ block_q = torch.zeros(1,1,block_num * 32,head_dim).to(q)
108
+ block_q[:,:,:q_len] = q
109
+ block_q = block_q.reshape(1,1,block_num,32,-1).mean(-2)
110
+ block_k = torch.zeros(1,1,block_num * 32,head_dim).to(k)
111
+ block_k[:,:,:q_len] = k
112
+ block_k = block_k.reshape(1,1,block_num,32,-1).mean(-2)
113
+
114
+ qk = torch.matmul(block_q, block_k.transpose(2, 3)) + attention_mask[:,:,:block_num,:block_num]
115
+ est_attn = torch.ones_like(qk)
116
+ block_topk = torch.topk(-qk, block_num - block_num//topk_ratio, -1).indices
117
+
118
+ dim = 3
119
+ est_attn = est_attn.scatter(3, block_topk.expand(*est_attn.shape[:dim], block_topk.shape[dim], *est_attn.shape[dim + 1 :]), 0)
120
+ est_attn = est_attn.unsqueeze(3).unsqueeze(-1).repeat(1,1,1,32,1,32).reshape(1,1,block_num * 32, block_num * 32)[...,:q_len,:q_len]
121
+ est_attn = torch.tril(est_attn)
122
+
123
+ attn_weights_x = attn_weights * est_attn
124
+ res2 = attn_weights_x[:,:,2500:].sum(-1).mean(-1).squeeze().float().detach().cpu().numpy()
125
+ return res2
126
+
127
+ global SEARCH_MASK
128
+ if SEARCH_MASK is None:
129
+ attention_mask = torch.full((q_len, q_len), torch.finfo(q.dtype).min, device="cuda")
130
+ mask_cond = torch.arange(attention_mask.size(-1), device="cuda")
131
+ attention_mask.masked_fill_(mask_cond < (mask_cond + 1).view(attention_mask.size(-1), 1), 0)
132
+ attention_mask = attention_mask[None, None, :]
133
+ SEARCH_MASK = attention_mask
134
+ else:
135
+ attention_mask = SEARCH_MASK
136
+ attn_weights = torch.matmul(q, k.transpose(2, 3)) / math.sqrt(head_dim) + attention_mask
137
+ attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(q.dtype)
138
+ best_s, best_v, best_score, best_ty = 0, 0, 0, ""
139
+ all_info = []
140
+ for ty, fc in [("stream_llm", stream_llm), ("vertical_and_slash", vertical_and_slash), ("block_sparse", block_sparse)]:
141
+ if ty == "stream_llm":
142
+ vs_list = [(100, 800)]
143
+ elif ty == "vertical_and_slash":
144
+ vs_list = [(30, 800), (100, 750), (500, 700), (3500, 100)]
145
+ else:
146
+ vs_list = [(8, 1)]
147
+ for v_size, s_size in vs_list:
148
+ score = fc(v_size, s_size)
149
+ score = score.item()
150
+ all_info.append([ty, v_size, s_size, score])
151
+ if score > best_score:
152
+ best_score = score
153
+ best_s, best_v = s_size, v_size
154
+ best_ty = ty
155
+ if best_ty == "stream_llm":
156
+ best_ty = "vertical_and_slash"
157
+ if best_ty == "block_sparse":
158
+ best_ty, best_v, best_s = "vertical_and_slash", 1000, 6096
159
+ print(head, best_ty, best_v, best_s, best_score)
160
+ return (best_ty, best_v, best_s, best_score)
161
+
162
+ def search_pattern_v2(q, k, v, head):
163
+ q_len = q.shape[2]
164
+ head_dim = q.shape[-1]
165
+ def vertical_and_slash_kernel(q, k, v, vertical_size, slash_size):
166
+ vertical_size, slash_size = min(q_len, max(vertical_size, 30)), min(q_len, max(slash_size, 50))
167
+ last_q = 64
168
+ qk = torch.einsum(f'bhmk, bhnk -> bhmn', q[:,:,-last_q:,:], k)
169
+ qk[:, :, :, -last_q:] = torch.where(LAST_Q_MASK, qk[:, :, :, -last_q:], -torch.inf)
170
+ qk = torch.nn.functional.softmax(qk, dim=-1, dtype=torch.float32)
171
+ vertical = qk.sum(-2, keepdim=True)
172
+ vertical[...,:30] = torch.inf
173
+ vertical_topk = torch.topk(vertical, vertical_size, -1).indices
174
+
175
+ slash = sum_all_diagonal_matrix(qk)[...,:-last_q + 1]
176
+ slash[...,-30:] = torch.inf
177
+ slash_topk = slash
178
+ slash = (q_len - 1) - torch.topk(slash, slash_size, -1).indices
179
+
180
+ return vertical_slash_sparse_attention(q, k, v, vertical_topk, slash)
181
+ def dense(q, k, v, vertical_size=None, slash_size=None):
182
+ return flash_attn_func(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1,2), 0.0, softmax_scale=None, causal=q_len != 1).view(bsz, 1, q_len, head_dim)
183
+ def block_sparse_kernel(q, k, v, vertical_size=None, slash_size=None):
184
+ topk = 100
185
+ return block_sparse_attention(q, k, v, topk)
186
+
187
+ best_s, best_v, best_score, best_ty = 0, 0, float("inf"), ""
188
+ bsz = q.shape[0]
189
+ all_info = []
190
+ ref = dense(q, k, v)
191
+ for ty, fc in [("stream_llm", streaming_forward), ("vertical_and_slash", vertical_and_slash_kernel), ("block_sparse", block_sparse_kernel)]:
192
+ if ty == "stream_llm":
193
+ vs_list = [(100, 800)]
194
+ elif ty == "vertical_and_slash":
195
+ vs_list = [(30, 800), (100, 800), (100, 750), (500, 700), (3500, 100), (1000, 4096)]
196
+ else:
197
+ vs_list = [(10, 1)]
198
+ for v_size, s_size in vs_list:
199
+ score = fc(q, k, v, v_size, s_size)
200
+ # delta = (ref - score).abs().sum()
201
+ delta = ((ref - score).abs() > 5e-3).sum()
202
+ score = delta.item()
203
+ all_info.append([ty, v_size, s_size, score])
204
+ if score < best_score:
205
+ best_score = score
206
+ best_s, best_v = s_size, v_size
207
+ best_ty = ty
208
+ print(head, best_ty, best_v, best_s, best_score)
209
+ return all_info
210
+
211
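+ # Turns the repeated last-row scores into a Toeplitz layout: row i, column j
+ # receives the last row's score for a key i - j positions behind the query,
+ # so every query row reuses the last query's relative-position pattern.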
+ def shift_matrix(mat):
212
+ b, h, _, n = mat.shape
213
+ zero_mat = torch.zeros((b, h, n, n)).to(mat.device) # Zero matrix used for padding
214
+ mat_padded = torch.cat((zero_mat, mat, zero_mat), -1) # pads the matrix on left and right
215
+ mat_strided = mat_padded.as_strided((1, 1, n, n + 2 * n), (1, n * (2 * n + n), 2 * n + n - 1, 1)) # Change the strides
216
+ return mat_strided[...,2 * n-1:-1]
217
+
218
+ def repeat(self, q, k, v, attention_mask):
219
+ q_len = q.shape[2]
220
+ if q_len == 1:
221
+ return gather_qkv(q, k, v, attention_mask)
222
+ qk = torch.matmul(q[:,:,-1:,:], k.transpose(2, 3)) / math.sqrt(self.head_dim)
223
+ qk = qk.repeat(1,1,q_len, 1)
224
+ qk = shift_matrix(qk) + attention_mask
225
+ attn_weights = nn.functional.softmax(qk, dim=-1, dtype=torch.float32).to(q.dtype)
226
+ attn_output = torch.matmul(attn_weights, v)
227
+ return attn_output
228
+
229
+ def gather_last_q_vertical_slash_topk_v4(self, q, k, v, head_id):
230
+ kv_seq_len = k.size(2)
231
+
232
+ def vertical_and_slash(attn_weights, vertical_size, slash_size):
233
+ last_q = 64
234
+ q_len = q.shape[2]
235
+ vertical_size, slash_size = min(q_len, max(vertical_size, 30)), min(q_len, max(slash_size, 50))
236
+ qk_idxs = [ii + q_len for ii in list(range(-last_q, 0, 1))]
237
+ qk = torch.matmul(q[:,:,qk_idxs,:], k.transpose(2, 3))/ math.sqrt(self.head_dim) + attention_mask[:,:,qk_idxs]
238
+ qk = torch.nn.functional.softmax(qk, dim=-1, dtype=torch.float32)
239
+ vertical = qk.sum(-2, keepdim=True)
240
+ vertical[...,:30] = -self.ne_inf
241
+ vertical_topk = torch.topk(-vertical, q_len - vertical_size, -1).indices
242
+
243
+ slash = sum_all_diagonal_matrix(qk)[...,:-last_q + 1]
244
+ slash[...,-30:] = -self.ne_inf
245
+ slash_topk = slash
246
+ slash = torch.topk(slash, slash_size, -1).indices - (q_len - 1)
247
+ slash = torch.stack([torch.sparse.spdiags(torch.ones(slash_size, q_len), slash.cpu()[0][_], (q_len, q_len)).to_dense() for _ in range(1)]).to(q.device)
248
+
249
+ est_attn = torch.ones_like(attn_weights)
250
+ dim = 3
251
+ est_attn = est_attn.scatter(3, vertical_topk.expand(*est_attn.shape[:dim], vertical_topk.shape[dim], *est_attn.shape[dim + 1 :]), 0)
252
+ est_attn = est_attn + slash
253
+
254
+ est_attn = (est_attn > 0).float()
255
+ est_attn = torch.tril(est_attn)
256
+ est_attn = (est_attn == 0).int() * self.ne_inf
257
+ attn_weights = attn_weights + est_attn
258
+ if self.kv_cache_compressed_v4:
259
+ self.vertical = torch.topk(vertical, vertical_size * 4, -1).indices
260
+ self.slash = (torch.topk(slash_topk, slash_size * 4, -1).indices - (q_len - 1)).unsqueeze(2)
261
+ return attn_weights
262
+
263
+ def stream_llm(attn_weights, vertical_size, slash_size):
264
+ q_len = q.shape[2]
265
+ vertical_size, slash_size = min(q_len, max(vertical_size, 30)), min(q_len, max(slash_size, 50))
266
+ mask = torch.triu(torch.tril(torch.ones(q_len, q_len), 0), -slash_size).to(q)
267
+ mask[:,:vertical_size] = 1
268
+ mask = mask.unsqueeze(0).unsqueeze(1)
269
+
270
+ est_attn = torch.tril(mask)
271
+ est_attn = (est_attn == 0).int() * self.ne_inf
272
+ attn_weights = attn_weights + est_attn
273
+ if self.kv_cache_compressed_v4:
274
+ self.vertical = torch.Tensor(list(range(vertical_size * 4))).long().to(q.device).unsqueeze(0).unsqueeze(0).unsqueeze(0)
275
+ self.slash = torch.Tensor(list(range(-slash_size * 4, 1))).long().to(q.device).unsqueeze(0).unsqueeze(0).unsqueeze(0)
276
+ return attn_weights
277
+
278
+ def block_sparse(attn_weights, topk_ratio, slash_size=None, block_size=8):
279
+ block_num = (q_len -1) // block_size + 1
280
+ block_q = torch.zeros(1,1,block_num * block_size,head_dim).to(q)
281
+ block_q[:,:,:q_len] = q
282
+ block_q = block_q.reshape(1,1,block_num,block_size,-1).mean(-2)
283
+ block_k = torch.zeros(1,1,block_num * block_size,head_dim).to(k)
284
+ block_k[:,:,:q_len] = k
285
+ block_k = block_k.reshape(1,1,block_num,block_size,-1).mean(-2)
286
+
287
+ qk = torch.matmul(block_q, block_k.transpose(2, 3)) + attention_mask[:,:,:block_num,:block_num]
288
+ est_attn = torch.ones_like(qk)
289
+ block_topk = torch.topk(-qk, block_num - block_num//topk_ratio, -1).indices
290
+
291
+ dim = 3
292
+ est_attn = est_attn.scatter(3, block_topk.expand(*est_attn.shape[:dim], block_topk.shape[dim], *est_attn.shape[dim + 1 :]), 0)
293
+ est_attn = est_attn.unsqueeze(3).unsqueeze(-1).repeat(1,1,1,block_size,1,block_size).reshape(1,1,block_num * block_size, block_num * block_size)[...,:q_len,:q_len]
294
+ est_attn = torch.tril(est_attn)
295
+ est_attn = (est_attn == 0).int() * self.ne_inf  # mask dropped blocks with -inf, matching the other branches
296
+ attn_weights = attn_weights + est_attn
297
+ return attn_weights
298
+
299
+ def dilated(q, k, v, type):
300
+ q_len = q.shape[2]
301
+ n_init = min(1024, q_len)
302
+ vertical_topk = torch.arange(0, n_init, device=q.device)[None, None, None, :]
303
+
304
+ slash = torch.arange(0, q_len, device=q.device)
305
+ if type == 'dilated1':
306
+ # 8k local with 1 interval
307
+ slash = slash[-8192::2][None, None, :]
308
+ elif type == 'dilated2':
309
+ # 2k dense local + 4k local with 1 interval
310
+ slash = torch.cat([slash[-2048:], slash[-6144:-2048:2]], 0)[None, None, :]
311
+
312
+ slash = (q_len - 1) - slash
313
+ return vertical_slash_sparse_attention(q, k, v, vertical_topk, slash)
314
+
315
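+ # vertical-and-slash estimation: score the last 64 queries against all keys,
+ # keep the strongest vertical columns and slash diagonals, and hand that
+ # index set to the sparse kernel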
+ def vertical_and_slash_kernel(q, k, v, vertical_size, slash_size):
316
+ vertical_size, slash_size = min(q_len, max(vertical_size, 30)), min(q_len, max(slash_size, 50))
317
+ last_q = min(64, q_len)
318
+ qk = torch.einsum(f'bhmk, bhnk -> bhmn', q[:,:,-last_q:,:], k)
319
+ qk[:, :, :, -last_q:] = torch.where(LAST_Q_MASK[...,-last_q:,-last_q:].to(q.device), qk[:, :, :, -last_q:], -torch.inf)
320
+ qk = torch.nn.functional.softmax(qk, dim=-1, dtype=torch.float32)
321
+ vertical = qk.sum(-2, keepdim=True)
322
+ vertical[...,:30] = torch.inf
323
+ vertical_topk = torch.topk(vertical, vertical_size, -1).indices
324
+
325
+ slash = sum_all_diagonal_matrix(qk)[...,:-last_q + 1]
326
+ slash[...,-100:] = torch.inf
327
+ slash_topk = slash
328
+ slash = (q_len - 1) - torch.topk(slash, slash_size, -1).indices
329
+
330
+ return vertical_slash_sparse_attention(q, k, v, vertical_topk, slash)
331
+
332
+ def vertical_and_slash_kernel_static(q, k, v, vertical_size, slash_size):
333
+ if "vs" in self.__dict__:
334
+ vertical_topk, slash = self.vs
335
+ else:
336
+ vertical_size, slash_size = min(q_len, max(vertical_size, 30)), min(q_len, max(slash_size, 50))
337
+ last_q = 64
338
+ qk = torch.einsum(f'bhmk, bhnk -> bhmn', q[:,:,-last_q:,:], k)
339
+ qk[:, :, :, -last_q:] = torch.where(LAST_Q_MASK, qk[:, :, :, -last_q:], -torch.inf)
340
+ qk = torch.nn.functional.softmax(qk, dim=-1, dtype=torch.float32)
341
+ vertical = qk.sum(-2, keepdim=True)
342
+ vertical[...,:30] = torch.inf
343
+ vertical_topk = torch.topk(vertical, vertical_size, -1).indices
344
+
345
+ slash = sum_all_diagonal_matrix(qk)[...,:-last_q + 1]
346
+ slash[...,-30:] = torch.inf
347
+ slash_topk = slash
348
+ slash = (q_len - 1) - torch.topk(slash, slash_size, -1).indices
349
+ self.vs = vertical_topk, slash
350
+
351
+ return vertical_slash_sparse_attention(q, k, v, vertical_topk, slash)
352
+ def dense(q, k, v, vertical_size=None, slash_size=None):
353
+ return flash_attn_func(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1,2), 0.0, softmax_scale=None, causal=q_len != 1).view(bsz, 1, q_len, self.head_dim)
354
+ def block_sparse_kernel(q, k, v, vertical_size=None, slash_size=None):
355
+ topk = 100
356
+ return block_sparse_attention(q, k, v, topk)
357
+
358
+ q_len = q.shape[2]
359
+ bsz = q.shape[0]
360
+
361
+ if self.config.to_dict().get("dilated1", False):
362
+ return dilated(q, k, v, 'dilated1')
363
+ if self.config.to_dict().get("dilated2", False):
364
+ return dilated(q, k, v, 'dilated2')
365
+ if self.config.to_dict().get("dense", False):
366
+ return dense(q, k, v)
367
+ if self.config.to_dict().get("streaming", False):
368
+ return streaming_forward(q, k, v, self.config.streaming_kwargs["n_init"], self.config.streaming_kwargs["n_local"])
369
+
370
+ ty, vertical_size, slash_size, _ = self.best_pattern.get(head_id, ("vertical_and_slash", 1000, 6096, 1))
371
+
372
+ if self.config.to_dict().get("static_pattern", False):
373
+ return vertical_and_slash_kernel_static(q, k, v, vertical_size, slash_size)
374
+ if self.config.to_dict().get("vs_only", False):
375
+ return vertical_and_slash_kernel(q, k, v, vertical_size, slash_size)
376
+
377
+ if q_len == 1:
378
+ return dense(q, k, v)
379
+
380
+ fc = {
381
+ "stream_llm": streaming_forward,
382
+ "vertical_and_slash": vertical_and_slash_kernel,
383
+ "block_sparse": block_sparse_kernel,
384
+ }[ty]
385
+ return fc(q, k, v, vertical_size, slash_size)
386
+
387
+ def apply_rotary_pos_emb_single(q, cos, sin, position_ids, unsqueeze_dim=1):
388
+ # cos = cos[position_ids].unsqueeze(unsqueeze_dim)
389
+ # sin = sin[position_ids].unsqueeze(unsqueeze_dim)
390
+ cos = cos.unsqueeze(unsqueeze_dim)
391
+ sin = sin.unsqueeze(unsqueeze_dim)
392
+ return (q * cos) + (rotate_half(q) * sin)
393
+
394
+ def minference_forward():
395
+ def forward(
396
+ self,
397
+ hidden_states,
398
+ attention_mask,
399
+ position_ids,
400
+ past_key_value,
401
+ output_attentions,
402
+ use_cache,
403
+ **kwargs,
404
+ ):
405
+ self.init_minference_parameters()
406
+ self.ne_inf = torch.finfo(hidden_states.dtype).min
407
+
408
+ bsz, q_len, _ = hidden_states.size()
409
+
410
+ if "q_proj" in self.__dict__["_modules"]:
411
+ query_states = self.q_proj(hidden_states)
412
+ key_states = self.k_proj(hidden_states)
413
+ value_states = self.v_proj(hidden_states)
414
+ else:
415
+ qkv = self.qkv_proj(hidden_states)
416
+ query_pos = self.num_heads * self.head_dim
417
+ query_states, key_states, value_states = torch.split(qkv, query_pos, -1)
418
+
419
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
420
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
421
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
422
+
423
+ kv_seq_len = key_states.shape[-2]
424
+ if past_key_value is not None:
425
+ if self.layer_idx is None:
426
+ raise ValueError(
427
+ f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
428
+ "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
429
+ "with a layer index."
430
+ )
431
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
432
+ global ROPE_TYPE
433
+ if ROPE_TYPE is None:
434
+ ROPE_TYPE = "seq_len" in inspect.signature(self.rotary_emb.forward).parameters
435
+ if ROPE_TYPE:
436
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
437
+ else:
438
+ cos, sin = self.rotary_emb(value_states, position_ids)
439
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
440
+
441
+ if past_key_value is not None:
442
+ cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
443
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
444
+
445
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
446
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
447
+ if self.is_search:
448
+ if os.path.exists(self.config_path):
449
+ config_list = json.load(open(self.config_path))
450
+ if self.layer_idx < len(config_list):
451
+ assert False, "a search result for this layer already exists in config_path"
452
+ else:
453
+ config_list = []
454
+ config = {}
455
+ print("Layer", self.layer_idx)
456
+ if q_len != 1:
457
+ output = torch.empty_like(query_states)
458
+ for head in range(query_states.size(1)):
459
+ q = query_states[:, head, :, :].unsqueeze(1)
460
+ k = key_states[:, head, :, :].unsqueeze(1)
461
+ v = value_states[:, head, :, :].unsqueeze(1)
462
+ if self.is_search:
463
+ config[head] = search_pattern(q, k, head)
464
+ if self.layer_idx >= self.starting_layer and not self.is_search:
465
+ attn_output = self.gather_last_q_vertical_slash_topk_v4(q, k, v, head)
466
+ elif is_flash_attn_2_available():
467
+ attn_output = flash_attn_func(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1,2), 0.0, softmax_scale=None, causal=q_len != 1).view(bsz, 1, q_len, self.head_dim)
468
+ else:
469
+ attn_output = gather_qkv(q, k, v, attention_mask)
470
+ output[:, head:head + 1] = attn_output
471
+ if self.is_search:
472
+ config_list.append(config)
473
+ with open(self.config_path, 'w') as json_file:
474
+ json.dump(config_list, json_file)
475
+ else:
476
+ output = flash_attn_func(query_states.transpose(1, 2), key_states.transpose(1, 2), value_states.transpose(1,2), 0.0, softmax_scale=None, causal=q_len != 1).view(bsz, query_states.size(1), q_len, self.head_dim)
477
+ attn_output = output.transpose(1, 2).contiguous()
478
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
479
+ attn_output = self.o_proj(attn_output)
480
+
481
+ return attn_output, None, past_key_value
482
+
483
+ return forward
484
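
The closure returned by minference_forward() is meant to replace a Hugging Face attention module's forward. A hedged sketch of that binding step (illustrative only: `model` is a hypothetical LlamaForCausalLM, and the real patch helper also attaches `init_minference_parameters` and related state, so it may differ in detail):

    import types

    sparse_forward = minference_forward()
    for layer in model.model.layers:
        layer.self_attn.forward = types.MethodType(sparse_forward, layer.self_attn)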
+
485
+ def minference_kv_cache_cpu_forward():
486
+ def forward(
487
+ self,
488
+ hidden_states,
489
+ attention_mask,
490
+ position_ids,
491
+ past_key_value,
492
+ output_attentions,
493
+ use_cache,
494
+ **kwargs,
495
+ ):
496
+ self.init_minference_parameters()
497
+ self.ne_inf = torch.finfo(hidden_states.dtype).min
498
+
499
+ bsz, q_len, hidden_dim = hidden_states.size()
500
+ kv_seq_len = q_len
501
+ if use_cache and past_key_value is not None:
502
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
503
+
504
+ global ROPE_TYPE
505
+ if ROPE_TYPE is None:
506
+ ROPE_TYPE = "seq_len" in inspect.signature(self.rotary_emb.forward).parameters
507
+ if ROPE_TYPE:
508
+ cos, sin = self.rotary_emb(hidden_states, seq_len=kv_seq_len)
509
+ else:
510
+ cos, sin = self.rotary_emb(hidden_states, position_ids)
511
+ cache_kwargs = {"sin": sin, "cos": cos}
512
+
513
+ attn_out = torch.empty_like(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim)
514
+ act_num_heads = self.num_heads // self.num_key_value_groups
515
+ if use_cache:
516
+ k = torch.zeros(bsz, act_num_heads, q_len, self.head_dim).to(hidden_states.dtype).cpu()
517
+ v = torch.zeros(bsz, act_num_heads, q_len, self.head_dim).to(hidden_states.dtype).cpu()
518
+ part_k, part_v = None, None
519
+ for head in range(self.num_heads):
520
+ if "q_proj" in self.__dict__["_modules"]:
521
+ part_q = F.linear(hidden_states, self.q_proj.weight.view(self.num_heads, self.head_dim, hidden_dim)[head]).unsqueeze(2)
522
+ else:
523
+ part_q = F.linear(hidden_states, self.qkv_proj.weight.view(3, self.num_heads, self.head_dim, hidden_dim)[0][head]).unsqueeze(2)
524
+ part_q = apply_rotary_pos_emb_single(part_q.transpose(1, 2), cos, sin, position_ids)
525
+
526
+ if head % self.num_key_value_groups == 0:
527
+ if "q_proj" in self.__dict__["_modules"]:
528
+ part_k = F.linear(hidden_states, self.k_proj.weight.view(act_num_heads, self.head_dim, hidden_dim)[head // self.num_key_value_groups]).unsqueeze(2)
529
+ part_v = F.linear(hidden_states, self.v_proj.weight.view(act_num_heads, self.head_dim, hidden_dim)[head // self.num_key_value_groups]).unsqueeze(2).transpose(1, 2)
530
+ else:
531
+ part_k = F.linear(hidden_states, self.qkv_proj.weight.view(3, act_num_heads, self.head_dim, hidden_dim)[1][head // self.num_key_value_groups]).unsqueeze(2)
532
+ part_v = F.linear(hidden_states, self.qkv_proj.weight.view(3, act_num_heads, self.head_dim, hidden_dim)[2][head // self.num_key_value_groups]).unsqueeze(2).transpose(1, 2)
533
+
534
+ part_k = apply_rotary_pos_emb_single(part_k.transpose(1, 2), cos, sin, position_ids)
535
+ if use_cache and past_key_value is not None:
536
+ k[:,head // self.num_key_value_groups] = part_k.cpu()
537
+ v[:,head // self.num_key_value_groups] = part_v.cpu()
538
+ part_k, part_v = past_key_value.get(part_k, part_v, self.layer_idx, head // self.num_key_value_groups, cache_kwargs)
539
+
540
+ if self.layer_idx >= self.starting_layer:
541
+ part_o = self.gather_last_q_vertical_slash_topk_v4(part_q, part_k, part_v, head)
542
+ else:
543
+ part_o = flash_attn_func(part_q, part_k, part_v.transpose(1, 2), 0.0, softmax_scale=None, causal=True).view(bsz, part_q.shape[1], self.head_dim)
544
+ attn_out[:, :, head, :] = part_o
545
+
546
+ if use_cache and past_key_value is not None:
547
+ past_key_value.update(k, v, self.layer_idx, cache_kwargs)
548
+ torch.matmul(attn_out.view(bsz, q_len, hidden_dim), self.o_proj.weight.T, out=hidden_states)
549
+ torch.cuda.empty_cache()
550
+ return (hidden_states, None, past_key_value)
551
+
552
+ return forward
553
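
The CPU-cache variant above never materializes the full q/k/v projections; it slices one head's rows out of the projection weight and applies them with F.linear. A small sketch showing that the sliced projection matches slicing the full projection (shapes follow the code above):

import torch
import torch.nn.functional as F

bsz, seq, num_heads, head_dim = 2, 5, 4, 8
hidden = num_heads * head_dim
x = torch.randn(bsz, seq, hidden)
w = torch.randn(hidden, hidden)              # stands in for q_proj.weight

full = F.linear(x, w).view(bsz, seq, num_heads, head_dim)
head = 3
part = F.linear(x, w.view(num_heads, head_dim, hidden)[head])  # one head's rows only
torch.testing.assert_close(part, full[..., head, :])
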
+
554
+ def minference_with_snapkv_forward():
555
+ def forward(
556
+ self,
557
+ hidden_states,
558
+ attention_mask,
559
+ position_ids,
560
+ past_key_value,
561
+ output_attentions,
562
+ use_cache,
563
+ **kwargs,
564
+ ):
565
+ self.init_minference_parameters()
566
+ self.ne_inf = torch.finfo(hidden_states.dtype).min
567
+
568
+ init_snapkv(self)
569
+
570
+ bsz, q_len, _ = hidden_states.size()
571
+
572
+ query_states = self.q_proj(hidden_states)
573
+ key_states = self.k_proj(hidden_states)
574
+ value_states = self.v_proj(hidden_states)
575
+
576
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
577
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
578
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
579
+
580
+ kv_seq_len = key_states.shape[-2]
581
+ if past_key_value is not None:
582
+ if self.layer_idx is None:
583
+ raise ValueError(
584
+ f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
585
+ "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
586
+ "with a layer index."
587
+ )
588
+
589
+ if hasattr(self, "kv_seq_len"): #[SnapKV] add kv_seq_len
590
+ if self.kv_seq_len != 0:
591
+ kv_seq_len += self.kv_seq_len
592
+ else:
593
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
594
+ else:
595
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
596
+ global ROPE_TYPE
597
+ if ROPE_TYPE is None:
598
+ ROPE_TYPE = "seq_len" in inspect.signature(self.rotary_emb.forward).parameters
599
+ if ROPE_TYPE:
600
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
601
+ else:
602
+ cos, sin = self.rotary_emb(value_states, position_ids)
603
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
604
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
605
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
606
+
607
+ if past_key_value is not None:
608
+ cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
609
+ if key_states.shape[-2] == kv_seq_len: # [SnapKV] add kv_cluster
610
+ self.kv_seq_len = kv_seq_len # [SnapKV] register kv_seq_len
611
+ key_states_compress, value_states_compress = self.kv_cluster.update_kv(key_states, query_states, value_states, attention_mask, self.num_key_value_groups)
612
+ past_key_value.update(key_states_compress, value_states_compress, self.layer_idx, cache_kwargs)
613
+ else:
614
+ self.kv_seq_len += q_len
615
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
616
+
617
+ if self.layer_idx >= self.starting_layer:
618
+ assert query_states.size(1) == key_states.size(1) == value_states.size(1)
619
+ output = torch.empty_like(query_states)
620
+ for head in range(query_states.size(1)):
621
+ q = query_states[:, head, :, :].unsqueeze(1)
622
+ k = key_states[:, head, :, :].unsqueeze(1)
623
+ v = value_states[:, head, :, :].unsqueeze(1)
624
+ output[:, head:head + 1] = self.gather_last_q_vertical_slash_topk_v4(q, k, v, head)
625
+
626
+ attn_output = output.transpose(1, 2).contiguous()
627
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
628
+ attn_output = self.o_proj(attn_output)
629
+ return attn_output, None, past_key_value
630
+
631
+ else:
632
+ output = torch.empty_like(query_states)
633
+ for head in range(query_states.size(1)):
634
+ q = query_states[:, head, :, :].unsqueeze(1)
635
+ k = key_states[:, head, :, :].unsqueeze(1)
636
+ v = value_states[:, head, :, :].unsqueeze(1)
637
+ if is_flash_attn_2_available():
638
+ attn_output = flash_attn_func(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1,2), 0.0, softmax_scale=None, causal=q_len != 1).view(bsz, 1, q.shape[2], self.head_dim)
639
+ else:
640
+ attn_output = gather_qkv(q, k, v, attention_mask)
641
+ output[:, head:head + 1] = attn_output
642
+ attn_output = output.transpose(1, 2).contiguous()
643
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
644
+ attn_output = self.o_proj(attn_output)
645
+
646
+ return attn_output, None, past_key_value
647
+
648
+ return forward
649
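
One detail worth noting in the SnapKV variant: compression runs exactly once, at prefill, which the code detects by comparing the fresh key length with the accumulated kv_seq_len. The decision rule, in sketch form:

def cache_action(new_kv_len: int, kv_seq_len: int) -> str:
    # Prefill: the whole prompt arrives in one call, so compress once.
    # Decode: only the newest token arrives, so append to the cache as usual.
    return "compress" if new_kv_len == kv_seq_len else "append"

assert cache_action(new_kv_len=4096, kv_seq_len=4096) == "compress"
assert cache_action(new_kv_len=1, kv_seq_len=4097) == "append"
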
+
650
+ def gather_last_q_vertical_slash_topk_vllm(self, q, k, v, head_id):
651
+ kv_seq_len = k.size(2)
652
+ head_dim = q.size(-1)
653
+
654
+ def vertical_and_slash_kernel(q, k, v, vertical_size, slash_size):
655
+ vertical_size, slash_size = min(q_len, max(vertical_size, 30)), min(q_len, max(slash_size, 50))
656
+ last_q = min(64, q_len)
657
+ qk = torch.einsum('bhmk, bhnk -> bhmn', q[:,:,-last_q:,:], k)
658
+
659
+ qk[:, :, :, -last_q:] = torch.where(LAST_Q_MASK[...,-last_q:,-last_q:], qk[:, :, :, -last_q:], -torch.inf)
660
+ qk = torch.nn.functional.softmax(qk, dim=-1, dtype=torch.float32)
661
+ vertical = qk.sum(-2, keepdim=True)
662
+ vertical[...,:30] = torch.inf
663
+ vertical_topk = torch.topk(vertical, vertical_size, -1).indices
664
+
665
+ slash = sum_all_diagonal_matrix(qk)[...,:-last_q + 1]
666
+ slash[...,-100:] = torch.inf
667
+ slash_topk = slash
668
+ slash = (q_len - 1) - torch.topk(slash, slash_size, -1).indices
669
+
670
+ return vertical_slash_sparse_attention(q, k, v, vertical_topk, slash)
671
+
672
+ def block_sparse_kernel(q, k, v, vertical_size=None, slash_size=None):
673
+ topk = 100
674
+ return block_sparse_attention(q, k, v, topk)
675
+
676
+ def dense(q, k, v, vertical_size=None, slash_size=None):
677
+ return flash_attn_func(q.transpose(1, 2), k.transpose(1, 2), v.transpose(1,2), 0.0, softmax_scale=None, causal=q_len != 1).view(bsz, 1, q_len, head_dim)
678
+
679
+ q_len = q.shape[2]
680
+ bsz = q.shape[0]
681
+
682
+ ty, vertical_size, slash_size, _ = self.best_pattern[head_id]
683
+
684
+ if q_len == 1:
685
+ return dense(q, k, v)
686
+
687
+ fc = {
688
+ "stream_llm": streaming_forward,
689
+ "vertical_and_slash": vertical_and_slash_kernel,
690
+ "block_sparse": block_sparse_kernel,
691
+ }[ty]
692
+ return fc(q, k, v, vertical_size, slash_size)
693
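
The pattern selector above estimates importance from only the last few queries: column sums of the softmaxed scores give "vertical" candidates and diagonal sums give "slash" candidates. A dense reference for those two statistics, where torch.diagonal stands in for sum_all_diagonal_matrix and the index bookkeeping is simplified relative to the real kernel:

import torch

last_q, seq_len = 4, 16
qk = torch.softmax(torch.randn(1, 1, last_q, seq_len), dim=-1)

vertical = qk.sum(-2)                                # [1, 1, seq_len] column mass
slash = torch.stack(
    [qk.diagonal(offset=o, dim1=-2, dim2=-1).sum(-1) for o in range(seq_len)],
    dim=-1,
)                                                    # [1, 1, seq_len] diagonal mass
v_idx = vertical.topk(8, dim=-1).indices             # columns to keep
s_idx = (seq_len - 1) - slash.topk(8, dim=-1).indices  # diagonal ids as offsets
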
+
694
+ def minference_vllm_forward(
695
+ pattern_config
696
+ ):
697
+ def forward(
698
+ self,
699
+ query: torch.Tensor,
700
+ key: torch.Tensor,
701
+ value: torch.Tensor,
702
+ kv_cache: torch.Tensor,
703
+ attn_metadata: AttentionMetadata[FlashAttentionMetadata],
704
+ kv_scale: float,
705
+ layer_idx: int,
706
+ ) -> torch.Tensor:
707
+ """Forward pass with FlashAttention and PagedAttention.
708
+
709
+ Args:
710
+ query: shape = [num_tokens, num_heads * head_size]
711
+ key: shape = [num_tokens, num_kv_heads * head_size]
712
+ value: shape = [num_tokens, num_kv_heads * head_size]
713
+ kv_cache: shape = [2, num_blocks, block_size * num_kv_heads * head_size]
714
+ attn_metadata: Metadata for attention.
715
+ Returns:
716
+ shape = [num_tokens, num_heads * head_size]
717
+ """
718
+ self.best_pattern = {int(ii): jj for ii, jj in pattern_config[layer_idx].items()}
719
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
720
+ """
721
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
722
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
723
+ """
724
+ slen, num_key_value_heads, head_dim = hidden_states.shape
725
+ if n_rep == 1:
726
+ return hidden_states
727
+ hidden_states = hidden_states[:, None, :, :].expand(slen, n_rep, num_key_value_heads, head_dim)
728
+ return hidden_states.reshape(slen, num_key_value_heads * n_rep, head_dim)
729
+
730
+ def minference_prefill_func(
731
+ q, k, v,
732
+
733
+ ):
734
+ # (seq_len, num_heads, head_size)
735
+ if q.size(-2) != k.size(-2):
736
+ k = repeat_kv(k, q.size(-2) // k.size(-2))
737
+ v = repeat_kv(v, q.size(-2) // v.size(-2))
738
+
739
+ output = torch.empty_like(q)
740
+ for head in range(q.size(-2)):
741
+ q_head = q[:, head, :].unsqueeze(1)
742
+ k_head = k[:, head, :].unsqueeze(1)
743
+ v_head = v[:, head, :].unsqueeze(1)
744
+
745
+ # (1, seq_len, num_heads, head_size)
746
+ q_head = q_head[None, ...]
747
+ k_head = k_head[None, ...]
748
+ v_head = v_head[None, ...]
749
+
750
+ q_head = q_head.transpose(1, 2)
751
+ k_head = k_head.transpose(1, 2)
752
+ v_head = v_head.transpose(1, 2)
753
+
754
+ out = self.gather_last_q_vertical_slash_topk_vllm(q_head, k_head, v_head, head)
755
+
756
+ out = out.transpose(1, 2).squeeze(0).contiguous()
757
+ output[:, head:head+1, :] = out
758
+ return output
759
+
760
+ num_tokens, hidden_size = query.shape
761
+ # Reshape the query, key, and value tensors.
762
+ query = query.view(-1, self.num_heads, self.head_size)
763
+ key = key.view(-1, self.num_kv_heads, self.head_size)
764
+ value = value.view(-1, self.num_kv_heads, self.head_size)
765
+
766
+ if kv_cache is not None:
767
+ key_cache, value_cache = PagedAttention.split_kv_cache(
768
+ kv_cache, self.num_kv_heads, self.head_size)
769
+
770
+ # Reshape the input keys and values and store them in the cache.
771
+ # If kv_cache is not provided, the new key and value tensors are
772
+ # not cached. This happens during the initial memory profiling run.
773
+ PagedAttention.write_to_paged_cache(key, value, key_cache,
774
+ value_cache,
775
+ attn_metadata.slot_mapping,
776
+ attn_metadata.kv_cache_dtype,
777
+ kv_scale)
778
+
779
+ num_prefill_tokens = attn_metadata.num_prefill_tokens
780
+ num_decode_tokens = attn_metadata.num_decode_tokens
781
+ assert key.shape[0] == num_prefill_tokens + num_decode_tokens
782
+ assert value.shape[0] == num_prefill_tokens + num_decode_tokens
783
+
784
+ output = torch.empty_like(query)
785
+ # Query for decode. KV is not needed because it is already cached.
786
+ decode_query = query[num_prefill_tokens:]
787
+ # QKV for prefill.
788
+ query = query[:num_prefill_tokens]
789
+ key = key[:num_prefill_tokens]
790
+ value = value[:num_prefill_tokens]
791
+
792
+ assert query.shape[0] == num_prefill_tokens
793
+ assert decode_query.shape[0] == num_decode_tokens
794
+
795
+ if prefill_meta := attn_metadata.prefill_metadata:
796
+ # Prompt run.
797
+ if kv_cache is None or prefill_meta.block_tables.numel() == 0:
798
+ # normal attention
799
+ # When block_tables are not filled, it means q and k are the
800
+ # prompt, and they have the same length.
801
+ # (seq_len, num_heads, head_size)
802
+ # out = flash_attn_varlen_func(
803
+ # q=query,
804
+ # k=key,
805
+ # v=value,
806
+ # cu_seqlens_q=prefill_meta.seq_start_loc,
807
+ # cu_seqlens_k=prefill_meta.seq_start_loc,
808
+ # max_seqlen_q=prefill_meta.max_prompt_len,
809
+ # max_seqlen_k=prefill_meta.max_prompt_len,
810
+ # softmax_scale=self.scale,
811
+ # causal=True,
812
+ # window_size=self.sliding_window,
813
+ # alibi_slopes=self.alibi_slopes,
814
+ # )
815
+ out = minference_prefill_func(query, key, value)
816
+ assert output[:num_prefill_tokens].shape == out.shape
817
+ output[:num_prefill_tokens] = out
818
+ else:
819
+ # prefix-enabled attention
820
+ # TODO(Hai) this triton kernel has regression issue (broke) to
821
+ # deal with different data types between KV and FP8 KV cache,
822
+ # to be addressed separately.
823
+ output[:num_prefill_tokens] = PagedAttention.forward_prefix(
824
+ query,
825
+ key,
826
+ value,
827
+ key_cache,
828
+ value_cache,
829
+ prefill_meta.block_tables,
830
+ prefill_meta.subquery_start_loc,
831
+ prefill_meta.prompt_lens_tensor,
832
+ prefill_meta.context_lens,
833
+ prefill_meta.max_subquery_len,
834
+ self.alibi_slopes,
835
+ )
836
+ if decode_meta := attn_metadata.decode_metadata:
837
+ # Decoding run.
838
+ output[num_prefill_tokens:] = PagedAttention.forward_decode(
839
+ decode_query,
840
+ key_cache,
841
+ value_cache,
842
+ decode_meta.block_tables,
843
+ decode_meta.context_lens,
844
+ decode_meta.max_context_len,
845
+ attn_metadata.kv_cache_dtype,
846
+ self.num_kv_heads,
847
+ self.scale,
848
+ self.alibi_slopes,
849
+ kv_scale,
850
+ )
851
+
852
+ # Reshape the output tensor.
853
+ return output.view(num_tokens, hidden_size)
854
+
855
+ return forward
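
In the vLLM path, prefill and decode tokens arrive concatenated in one flat token batch, so the forward simply slices the token axis before dispatching to the sparse prefill path or the paged decode path. A minimal sketch of that bookkeeping:

import torch

num_prefill_tokens, num_decode_tokens = 6, 2
query = torch.randn(num_prefill_tokens + num_decode_tokens, 32, 128)

prefill_query = query[:num_prefill_tokens]   # attended with the full prompt k/v
decode_query = query[num_prefill_tokens:]    # k/v come from the paged cache
assert prefill_query.shape[0] + decode_query.shape[0] == query.shape[0]
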
minference/modules/snap_kv.py ADDED
@@ -0,0 +1,422 @@
1
+ import math
2
+ import time
3
+ import warnings
4
+ from importlib.metadata import version
5
+ from typing import List, Optional, Tuple, Union
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import torch.nn.functional as F
10
+ import transformers
11
+ from transformers.cache_utils import Cache, DynamicCache
12
+ from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv
13
+ from transformers.utils import logging
14
+
15
+ logger = logging.get_logger(__name__)
16
+
17
+
18
+ # https://github.com/huggingface/transformers/blob/v4.37-release/src/transformers/models/llama/modeling_llama.py
19
+ def llama_flash_attn2_forward(
20
+ self,
21
+ hidden_states: torch.Tensor,
22
+ attention_mask: Optional[torch.LongTensor] = None,
23
+ position_ids: Optional[torch.LongTensor] = None,
24
+ past_key_value: Optional[Cache] = None,
25
+ output_attentions: bool = False,
26
+ use_cache: bool = False,
27
+ **kwargs,
28
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
29
+ # [SnapKV] register kv_cluster
30
+ init_snapkv(self)
31
+ # LlamaFlashAttention2 attention does not support output_attentions
32
+ if "padding_mask" in kwargs:
33
+ warnings.warn(
34
+ "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
35
+ )
36
+
37
+ # overwrite attention_mask with padding_mask
38
+ attention_mask = kwargs.pop("padding_mask")
39
+
40
+ output_attentions = False
41
+
42
+ bsz, q_len, _ = hidden_states.size()
43
+
44
+ query_states = self.q_proj(hidden_states)
45
+ key_states = self.k_proj(hidden_states)
46
+ value_states = self.v_proj(hidden_states)
47
+
48
+ # Flash attention requires the input to have the shape
49
+ # batch_size x seq_length x head_dim x hidden_dim
50
+ # therefore we just need to keep the original shape
51
+ query_states = query_states.view(
52
+ bsz, q_len, self.num_heads, self.head_dim
53
+ ).transpose(1, 2)
54
+ key_states = key_states.view(
55
+ bsz, q_len, self.num_key_value_heads, self.head_dim
56
+ ).transpose(1, 2)
57
+ value_states = value_states.view(
58
+ bsz, q_len, self.num_key_value_heads, self.head_dim
59
+ ).transpose(1, 2)
60
+
61
+ kv_seq_len = key_states.shape[-2]
62
+ # if past_key_value is not None:
63
+ # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
64
+ if past_key_value is not None:
65
+ if self.layer_idx is None:
66
+ raise ValueError(
67
+ f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
68
+ "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
69
+ "with a layer index."
70
+ )
71
+ if hasattr(self, "kv_seq_len"): # [SnapKV] add kv_seq_len
72
+ if self.kv_seq_len != 0:
73
+ kv_seq_len += self.kv_seq_len
74
+ else:
75
+ kv_seq_len += past_key_value.get_usable_length(
76
+ kv_seq_len, self.layer_idx
77
+ )
78
+ else:
79
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
80
+
81
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
82
+ query_states, key_states = apply_rotary_pos_emb(
83
+ query_states, key_states, cos, sin, position_ids
84
+ )
85
+ # [SnapKV] move to ahead
86
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
87
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
88
+
89
+ if past_key_value is not None:
90
+ cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
91
+ # key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
92
+ # print('kv_seq_len:', kv_seq_len)
93
+ # print('key_states.shape:', key_states.shape)
94
+ if key_states.shape[-2] == kv_seq_len: # [SnapKV] add kv_cluster
95
+ self.kv_seq_len = kv_seq_len # [SnapKV] register kv_seq_len
96
+ key_states_compress, value_states_compress = self.kv_cluster.update_kv(
97
+ key_states,
98
+ query_states,
99
+ value_states,
100
+ attention_mask,
101
+ self.num_key_value_groups,
102
+ )
103
+ past_key_value.update(
104
+ key_states_compress, value_states_compress, self.layer_idx, cache_kwargs
105
+ )
106
+ else:
107
+ self.kv_seq_len += q_len
108
+ key_states, value_states = past_key_value.update(
109
+ key_states, value_states, self.layer_idx, cache_kwargs
110
+ )
111
+
112
+ # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
113
+ # to be able to avoid many of these transpose/reshape/view.
114
+ query_states = query_states.transpose(1, 2)
115
+ key_states = key_states.transpose(1, 2)
116
+ value_states = value_states.transpose(1, 2)
117
+
118
+ dropout_rate = self.attention_dropout if self.training else 0.0
119
+
120
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons
121
+ # therefore the input hidden states gets silently casted in float32. Hence, we need
122
+ # cast them back in the correct dtype just to be sure everything works as expected.
123
+ # This might slowdown training & inference so it is recommended to not cast the LayerNorms
124
+ # in fp32. (LlamaRMSNorm handles it correctly)
125
+
126
+ input_dtype = query_states.dtype
127
+ if input_dtype == torch.float32:
128
+ if torch.is_autocast_enabled():
129
+ target_dtype = torch.get_autocast_gpu_dtype()
130
+ # Handle the case where the model is quantized
131
+ elif hasattr(self.config, "_pre_quantization_dtype"):
132
+ target_dtype = self.config._pre_quantization_dtype
133
+ else:
134
+ target_dtype = self.q_proj.weight.dtype
135
+
136
+ logger.warning_once(
137
+ f"The input hidden states seems to be silently casted in float32, this might be related to"
138
+ f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
139
+ f" {target_dtype}."
140
+ )
141
+
142
+ query_states = query_states.to(target_dtype)
143
+ key_states = key_states.to(target_dtype)
144
+ value_states = value_states.to(target_dtype)
145
+
146
+ attn_output = self._flash_attention_forward(
147
+ query_states,
148
+ key_states,
149
+ value_states,
150
+ attention_mask,
151
+ q_len,
152
+ dropout=dropout_rate,
153
+ )
154
+
155
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
156
+ attn_output = self.o_proj(attn_output)
157
+
158
+ if not output_attentions:
159
+ attn_weights = None
160
+
161
+ return attn_output, attn_weights, past_key_value
162
+
163
+
164
+ def prepare_inputs_for_generation_llama(
165
+ self,
166
+ input_ids,
167
+ past_key_values=None,
168
+ attention_mask=None,
169
+ inputs_embeds=None,
170
+ **kwargs,
171
+ ):
172
+ if past_key_values is None: # [SnapKV]
173
+ for layer in self.model.layers:
174
+ layer.self_attn.kv_seq_len = 0
175
+ if past_key_values is not None:
176
+ if isinstance(past_key_values, Cache):
177
+ cache_length = past_key_values.get_seq_length()
178
+ past_length = past_key_values.seen_tokens
179
+ max_cache_length = past_key_values.get_max_length()
180
+ else:
181
+ # cache_length = past_length = past_key_values[0][0].shape[2]
182
+ # max_cache_length = None
183
+ cache_length = past_length = self.model.layers[0].self_attn.kv_seq_len
184
+ max_cache_length = None
185
+ # Keep only the unprocessed tokens:
186
+ # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
187
+ # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
188
+ # input)
189
+ if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
190
+ input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
191
+ # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
192
+ # input_ids based on the past_length.
193
+ elif past_length < input_ids.shape[1]:
194
+ input_ids = input_ids[:, past_length:]
195
+ # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
196
+
197
+ # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
198
+ if (
199
+ max_cache_length is not None
200
+ and attention_mask is not None
201
+ and cache_length + input_ids.shape[1] > max_cache_length
202
+ ):
203
+ attention_mask = attention_mask[:, -max_cache_length:]
204
+
205
+ position_ids = kwargs.get("position_ids", None)
206
+ if attention_mask is not None and position_ids is None:
207
+ # create position_ids on the fly for batch generation
208
+ position_ids = attention_mask.long().cumsum(-1) - 1
209
+ position_ids.masked_fill_(attention_mask == 0, 1)
210
+ if past_key_values:
211
+ position_ids = position_ids[:, -input_ids.shape[1] :]
212
+
213
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
214
+ if inputs_embeds is not None and past_key_values is None:
215
+ model_inputs = {"inputs_embeds": inputs_embeds}
216
+ else:
217
+ model_inputs = {"input_ids": input_ids}
218
+
219
+ model_inputs.update(
220
+ {
221
+ "position_ids": position_ids,
222
+ "past_key_values": past_key_values,
223
+ "use_cache": kwargs.get("use_cache"),
224
+ "attention_mask": attention_mask,
225
+ }
226
+ )
227
+ return model_inputs
228
+
229
+
230
+ llama_flash_attn2_forward_4_37 = llama_flash_attn2_forward
231
+ prepare_inputs_for_generation_llama_4_37 = prepare_inputs_for_generation_llama
232
+
233
+
234
+ @torch.no_grad()
235
+ def rope_forward(self, x, seq_len):
236
+ # x: [bs, num_attention_heads, seq_len, head_size]
237
+ position_ids = torch.arange(seq_len, device=x.device).unsqueeze(0)
238
+ inv_freq_expanded = (
239
+ self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
240
+ )
241
+ position_ids_expanded = position_ids[:, None, :].float()
242
+ # Force float32 since bfloat16 loses precision on long contexts
243
+ # See https://github.com/huggingface/transformers/pull/29285
244
+ device_type = x.device.type
245
+ device_type = (
246
+ device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
247
+ )
248
+ with torch.autocast(device_type=device_type, enabled=False):
249
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(
250
+ 1, 2
251
+ )
252
+ emb = torch.cat((freqs, freqs), dim=-1)
253
+ cos = emb.cos()
254
+ sin = emb.sin()
255
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
256
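
rope_forward only builds the cos/sin tables; a consumer still has to apply them. Assuming the standard rotate-half convention used by transformers' apply_rotary_pos_emb, applying the returned tables looks like this sketch:

import torch

def rotate_half(x):
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

bs, heads, seq, dim = 1, 2, 6, 8
q = torch.randn(bs, heads, seq, dim)
# cos/sin with shape [bs, seq, dim], as rope_forward returns them
pos = torch.arange(seq).float()
inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2).float() / dim))
freqs = torch.outer(pos, inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)[None]
cos, sin = emb.cos(), emb.sin()

q_rot = q * cos.unsqueeze(1) + rotate_half(q) * sin.unsqueeze(1)
assert q_rot.shape == q.shape
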
+
257
+
258
+ ##################
259
+
260
+ # perform qk calculation and get indices
261
+ # this version will not update in inference mode
262
+
263
+
264
+ # Copied from transformers.models.llama.modeling_llama.repeat_kv
265
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
266
+ """
267
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
268
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
269
+ """
270
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
271
+ if n_rep == 1:
272
+ return hidden_states
273
+ hidden_states = hidden_states[:, :, None, :, :].expand(
274
+ batch, num_key_value_heads, n_rep, slen, head_dim
275
+ )
276
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
277
+
278
+
279
+ class SnapKVCluster:
280
+ def __init__(
281
+ self,
282
+ window_size=64,
283
+ max_capacity_prompt=256 + 64,
284
+ kernel_size=5,
285
+ pooling="avgpool",
286
+ ):
287
+ self.window_size = window_size
288
+ self.max_capacity_prompt = max_capacity_prompt
289
+ assert self.max_capacity_prompt - self.window_size > 0
290
+ self.kernel_size = kernel_size
291
+ self.pooling = pooling
292
+
293
+ def reset(
294
+ self,
295
+ window_size=64,
296
+ max_capacity_prompt=256 + 64,
297
+ kernel_size=5,
298
+ pooling="avgpool",
299
+ ):
300
+ self.window_size = window_size
301
+ self.max_capacity_prompt = max_capacity_prompt
302
+ assert self.max_capacity_prompt - self.window_size > 0
303
+ self.kernel_size = kernel_size
304
+ self.pooling = pooling
305
+
306
+ def update_kv(
307
+ self,
308
+ key_states,
309
+ query_states,
310
+ value_states,
311
+ attention_mask,
312
+ num_key_value_groups,
313
+ ):
314
+ # check if prefix phase
315
+ assert key_states.shape[-2] == query_states.shape[-2]
316
+ bsz, num_heads, q_len, head_dim = query_states.shape
317
+ if q_len < self.max_capacity_prompt:
318
+ return key_states, value_states
319
+ else:
320
+ attn_weights = torch.matmul(
321
+ query_states[..., -self.window_size :, :], key_states.transpose(2, 3)
322
+ ) / math.sqrt(head_dim)
323
+ mask = torch.full(
324
+ (self.window_size, self.window_size),
325
+ torch.finfo(attn_weights.dtype).min,
326
+ device=attn_weights.device,
327
+ )
328
+ mask_cond = torch.arange(mask.size(-1), device=attn_weights.device)
329
+ mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0)
330
+ mask = mask.to(attn_weights.device)
331
+ attention_mask = mask[None, None, :, :]
332
+
333
+ attn_weights[
334
+ :, :, -self.window_size :, -self.window_size :
335
+ ] += attention_mask
336
+
337
+ attn_weights = nn.functional.softmax(
338
+ attn_weights, dim=-1, dtype=torch.float32
339
+ ).to(query_states.dtype)
340
+ attn_weights_sum = attn_weights[
341
+ :, :, -self.window_size :, : -self.window_size
342
+ ].sum(dim=-2)
343
+ if self.pooling == "avgpool":
344
+ attn_cache = F.avg_pool1d(
345
+ attn_weights_sum,
346
+ kernel_size=self.kernel_size,
347
+ padding=self.kernel_size // 2,
348
+ stride=1,
349
+ )
350
+ elif self.pooling == "maxpool":
351
+ attn_cache = F.max_pool1d(
352
+ attn_weights_sum,
353
+ kernel_size=self.kernel_size,
354
+ padding=self.kernel_size // 2,
355
+ stride=1,
356
+ )
357
+ else:
358
+ raise ValueError("Pooling method not supported")
359
+ indices = attn_cache.topk(
360
+ self.max_capacity_prompt - self.window_size, dim=-1
361
+ ).indices
362
+ indices = indices.unsqueeze(-1).expand(-1, -1, -1, head_dim)
363
+ k_past_compress = key_states[:, :, : -self.window_size, :].gather(
364
+ dim=2, index=indices
365
+ )
366
+ v_past_compress = value_states[:, :, : -self.window_size, :].gather(
367
+ dim=2, index=indices
368
+ )
369
+ k_cur = key_states[:, :, -self.window_size :, :]
370
+ v_cur = value_states[:, :, -self.window_size :, :]
371
+ key_states = torch.cat([k_past_compress, k_cur], dim=2)
372
+ value_states = torch.cat([v_past_compress, v_cur], dim=2)
373
+ return key_states, value_states
374
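
A compact walkthrough of the selection logic above at toy sizes (window of 2, budget of 4), using the same ops on random tensors; the pooling step and the in-window causal mask are omitted for brevity:

import torch

torch.manual_seed(0)
bsz, heads, q_len, dim = 1, 1, 8, 4
window, capacity = 2, 4
q = torch.randn(bsz, heads, q_len, dim)
k = torch.randn(bsz, heads, q_len, dim)

# attention of the last `window` queries over all keys, as in update_kv
w = (q[..., -window:, :] @ k.transpose(-1, -2)) / dim ** 0.5
w = w.softmax(-1)
score = w[..., :-window].sum(dim=-2)             # mass landing on the prefix only
keep = score.topk(capacity - window, dim=-1).indices
idx = keep.unsqueeze(-1).expand(-1, -1, -1, dim)

k_keep = torch.cat([k[..., :-window, :].gather(2, idx), k[..., -window:, :]], dim=2)
assert k_keep.shape[-2] == capacity              # 2 selected + 2 window tokens
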
+
375
+
376
+ def init_snapkv(self):
377
+ if not hasattr(self, "kv_cluster"):
378
+ if not hasattr(self.config, "window_size"):
379
+ self.config.window_size = 64
380
+ if not hasattr(self.config, "max_capacity_prompt"):
381
+ self.config.max_capacity_prompt = 4096
382
+ if not hasattr(self.config, "kernel_size"):
383
+ self.config.kernel_size = 13
384
+ if not hasattr(self.config, "pooling"):
385
+ self.config.pooling = "avgpool"
386
+ self.kv_cluster = SnapKVCluster(
387
+ window_size=self.config.window_size,
388
+ max_capacity_prompt=self.config.max_capacity_prompt,
389
+ kernel_size=self.config.kernel_size,
390
+ pooling=self.config.pooling,
391
+ )
392
+
393
+
394
+ ############
395
+
396
+
397
+ def check_version():
398
+ try:
399
+ transformers_version = version("transformers")
400
+ except Exception as e:
401
+ print(f"Transformers not installed: {e}")
402
+ return transformers_version
403
+
404
+
405
+ def replace_llama():
406
+ transformers_version = check_version()
407
+ version_list = ["4.37"]
408
+ warning_flag = True
409
+ for ver in version_list:
410
+ if ver in transformers_version:
411
+ warning_flag = False
412
+ break
413
+ if warning_flag:
414
+ warnings.warn(
415
+ f"Transformers version {transformers_version} might not be compatible with SnapKV. SnapKV is tested with Transformers version {version_list}."
416
+ )
417
+ transformers.models.llama.modeling_llama.LlamaForCausalLM.prepare_inputs_for_generation = (
418
+ prepare_inputs_for_generation_llama_4_37
419
+ )
420
+ transformers.models.llama.modeling_llama.LlamaFlashAttention2.forward = (
421
+ llama_flash_attn2_forward_4_37
422
+ )
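
A plausible usage sketch for the patch above; the checkpoint name and the attn_implementation flag are assumptions, not part of this file. replace_llama() is called before loading so that LlamaFlashAttention2 already points at the SnapKV forward, then the budget knobs are set on the config:

from transformers import AutoModelForCausalLM

replace_llama()  # monkey-patch LlamaFlashAttention2.forward and prepare_inputs_for_generation

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",          # hypothetical checkpoint
    attn_implementation="flash_attention_2",
)
model.config.window_size = 64            # recent tokens kept verbatim
model.config.max_capacity_prompt = 4096  # total KV budget after compression
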
minference/ops/block_sparse_flash_attention.py ADDED
@@ -0,0 +1,464 @@
1
+ import torch
2
+ import numpy as np
3
+
4
+ import triton
5
+ import triton.language as tl
6
+ import pycuda.autoprimaryctx
7
+ from pycuda.compiler import SourceModule
8
+
9
+ from flash_attn import flash_attn_varlen_func
10
+
11
+
12
+ # @triton.autotune(
13
+ # configs=[
14
+ # triton.Config({}, num_stages=1, num_warps=4),
15
+ # triton.Config({}, num_stages=1, num_warps=8),
16
+ # triton.Config({}, num_stages=2, num_warps=4),
17
+ # triton.Config({}, num_stages=2, num_warps=8),
18
+ # triton.Config({}, num_stages=3, num_warps=4),
19
+ # triton.Config({}, num_stages=3, num_warps=8),
20
+ # triton.Config({}, num_stages=4, num_warps=4),
21
+ # triton.Config({}, num_stages=4, num_warps=8),
22
+ # triton.Config({}, num_stages=5, num_warps=4),
23
+ # triton.Config({}, num_stages=5, num_warps=8),
24
+ # ],
25
+ # key=['N_CTX'],
26
+ # )
27
+ @triton.jit
28
+ def triton_block_sparse_attn_kernel(
29
+ Q, K, V, seqlens, sm_scale,
30
+ block_index,
31
+ Out,
32
+ stride_qz, stride_qh, stride_qm, stride_qk,
33
+ stride_kz, stride_kh, stride_kn, stride_kk,
34
+ stride_vz, stride_vh, stride_vn, stride_vk,
35
+ stride_oz, stride_oh, stride_om, stride_ok,
36
+ Z, H, N_CTX,
37
+ NUM_ROWS, MAX_BLOCKS_PRE_ROW,
38
+ BLOCK_M: tl.constexpr,
39
+ BLOCK_N: tl.constexpr,
40
+ BLOCK_DMODEL: tl.constexpr,
41
+ dtype: tl.constexpr,
42
+ ):
43
+ start_m = tl.program_id(0)
44
+ off_hz = tl.program_id(1)
45
+
46
+ seqlen = tl.load(seqlens + off_hz // H)
47
+ if start_m * BLOCK_M >= seqlen:
48
+ return
49
+
50
+ # initialize offsets
51
+ offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
52
+ offs_n = tl.arange(0, BLOCK_N)
53
+ offs_d = tl.arange(0, BLOCK_DMODEL)
54
+
55
+ qo_offset = (off_hz // H) * stride_qz + (off_hz % H) * stride_qh
56
+ kv_offset = (off_hz // H) * stride_kz + (off_hz % H) * stride_kh
57
+
58
+ q_ptrs = Q + qo_offset + offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qk
59
+ k_ptrs = K + kv_offset + offs_d[:, None] * stride_kk
60
+ v_ptrs = V + kv_offset + offs_d[None, :] * stride_vk
61
+ o_ptrs = Out + qo_offset + offs_m[:, None] * stride_om + offs_d[None, :] * stride_ok
62
+
63
+ blocks_ptr = block_index + (off_hz * NUM_ROWS + start_m) * MAX_BLOCKS_PRE_ROW
64
+
65
+ # initialize pointer to m and l
66
+ m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
67
+ l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
68
+ acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
69
+ # scale sm_scale by log_2(e) and use
70
+ # 2^x instead of exp in the loop because CSE and LICM
71
+ # don't work as expected with `exp` in the loop
72
+ qk_scale = sm_scale * 1.44269504
73
+ # load q: it will stay in SRAM throughout
74
+ q = tl.load(q_ptrs)
75
+ q = (q * qk_scale).to(dtype)
76
+
77
+ # loop over k, v and update accumulator
78
+ m_mask = offs_m[:, None] < seqlen
79
+ block_count = tl.minimum((start_m + 1) * BLOCK_M // BLOCK_N, MAX_BLOCKS_PRE_ROW)
80
+
81
+ for sparse_block_idx in range(block_count):
82
+ real_block_idx = tl.load(blocks_ptr + sparse_block_idx)
83
+ start_n = real_block_idx * BLOCK_N
84
+ cols = start_n + offs_n
85
+ # -- load k, v --
86
+ k = tl.load(k_ptrs + cols[None, :] * stride_kn)
87
+ v = tl.load(v_ptrs + cols[:, None] * stride_vn)
88
+ # -- compute qk --
89
+ qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
90
+ # if start_n + BLOCK_N < seqlen:
91
+ # qk = tl.where(m_mask, qk, float("-inf"))
92
+ # else:
93
+ causal_mask = cols[None, :] <= offs_m[:, None]
94
+ qk = tl.where(m_mask & causal_mask, qk, float("-inf"))
95
+ qk += tl.dot(q, k)
96
+ # -- compute scaling constant --
97
+ m_i_new = tl.maximum(m_i, tl.max(qk, 1))
98
+ alpha = tl.math.exp2(m_i - m_i_new)
99
+ p = tl.math.exp2(qk - m_i_new[:, None])
100
+ # -- scale and update acc --
101
+ acc_scale = l_i * 0 + alpha # workaround some compiler bug
102
+ acc *= acc_scale[:, None]
103
+ acc += tl.dot(p.to(dtype), v)
104
+ # -- update m_i and l_i --
105
+ l_i = l_i * alpha + tl.sum(p, 1)
106
+ m_i = m_i_new
107
+
108
+ # write back O
109
+ acc /= l_i[:, None]
110
+ tl.store(o_ptrs, acc.to(dtype), mask=m_mask)
111
+
112
+
113
+ def triton_block_sparse_forward(
114
+ q, # [BATCH, N_HEADS, N_CTX, D_HEAD]
115
+ k, # [BATCH, N_HEADS, N_CTX, D_HEAD]
116
+ v, # [BATCH, N_HEADS, N_CTX, D_HEAD]
117
+ seqlens, # [BATCH, ]
118
+ block_index, # [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), MAX_BLOCKS_PRE_ROW]
119
+ sm_scale,
120
+ block_size_M=64,
121
+ block_size_N=64,
122
+ ) -> torch.Tensor:
123
+ # shape constraints
124
+ Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
125
+ assert Lq == Lk and Lk == Lv
126
+ assert Lk in {16, 32, 64, 128}
127
+ o = torch.zeros_like(q)
128
+ grid = (triton.cdiv(q.shape[2], block_size_M), q.shape[0] * q.shape[1], 1)
129
+ dtype = tl.bfloat16 if q.dtype == torch.bfloat16 else tl.float16
130
+ triton_block_sparse_attn_kernel[grid](
131
+ q, k, v, seqlens, sm_scale,
132
+ block_index,
133
+ o,
134
+ q.stride(0), q.stride(1), q.stride(2), q.stride(3),
135
+ k.stride(0), k.stride(1), k.stride(2), k.stride(3),
136
+ v.stride(0), v.stride(1), v.stride(2), v.stride(3),
137
+ o.stride(0), o.stride(1), o.stride(2), o.stride(3),
138
+ q.shape[0], q.shape[1], q.shape[2],
139
+ block_index.shape[-2], block_index.shape[-1],
140
+ BLOCK_M=block_size_M, BLOCK_N=block_size_N,
141
+ BLOCK_DMODEL=Lk,
142
+ dtype=dtype,
143
+ num_warps=4, num_stages=2,
144
+ )
145
+
146
+ return o
147
+
148
+
149
+ def torch_build_index(
150
+ query: torch.Tensor, # [BATCH, N_HEADS, N_CTX, D_HEAD]
151
+ key: torch.Tensor, # [BATCH, N_HEADS, N_CTX, D_HEAD]
152
+ top_k: int,
153
+ block_size_M: int = 64,
154
+ block_size_N: int = 64,
155
+ ):
156
+ batch_size, num_heads, context_size, head_dim = query.shape
157
+ query_pool = query.reshape((batch_size, num_heads, -1, block_size_M, head_dim)).mean(dim=-2)
158
+ key_pool = key.reshape((batch_size, num_heads, -1, block_size_N, head_dim)).mean(dim=-2)
159
+ arange_M = torch.arange(query_pool.shape[-2], dtype=torch.int32, device=query.device) * block_size_M
160
+ arange_N = torch.arange(key_pool.shape[-2], dtype=torch.int32, device=key.device) * block_size_N
161
+ p_pool = torch.einsum('bhmk, bhnk -> bhmn', query_pool, key_pool)
162
+ p_pool = p_pool.where(arange_M[None, None, :, None] >= arange_N[None, None, None, :], -torch.inf)
163
+ top_k = min(top_k, context_size // block_size_N)
164
+ return torch.topk(p_pool, top_k, dim=-1).indices.to(torch.int32).sort(dim=-1).values
165
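
torch_build_index picks key blocks by pooled (block-mean) attention, masked so a query block can only select blocks at or before itself. Since it is pure torch ops it also runs on CPU tensors; a quick check of its contract:

import torch

q = torch.randn(1, 2, 256, 64)
k = torch.randn(1, 2, 256, 64)
idx = torch_build_index(q, k, top_k=3)   # block ids, sorted ascending per row
assert idx.shape == (1, 2, 256 // 64, 3)
assert (idx[..., :-1] <= idx[..., 1:]).all()
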
+
166
+
167
+ def make_causal_mask(seqlens, device, context_size):
168
+ batch_size = seqlens.shape[0]
169
+ arange = torch.arange(context_size, dtype=torch.int32, device=device)
170
+ causal_mask = arange[None, None, :, None] >= arange[None, None, None, :]
171
+ causal_mask = causal_mask.repeat((batch_size, 1, 1, 1))
172
+ for b, seqlen in enumerate(seqlens):
173
+ causal_mask[b, :, seqlen:, :] = False
174
+ causal_mask[b, :, :, seqlen:] = False
175
+ return causal_mask
176
+
177
+
178
+ def make_block_mask(block_index, causal_mask, device, block_size_M=64, block_size_N=64):
179
+ batch_size, num_heads, num_rows, max_blocks_per_row = block_index.shape
180
+ context_size = causal_mask.shape[-1]
181
+ block_mask = torch.zeros((batch_size, num_heads, context_size, context_size), dtype=torch.bool, device=device)
182
+ for b in range(batch_size):
183
+ for h in range(num_heads):
184
+ for i in range(num_rows):
185
+ start_m = i * block_size_M
186
+ end_m = start_m + block_size_M
187
+ for j in range(max_blocks_per_row):
188
+ real_j = block_index[b, h, i, j]
189
+ start_n = real_j * block_size_N
190
+ end_n = start_n + block_size_N
191
+ block_mask[b, h, start_m:end_m, start_n:end_n] = True
192
+ block_mask.logical_and_(causal_mask)
193
+ return block_mask
194
+
195
+
196
+ def plot_mask(mask, name, batch=0, head=0):
197
+ import matplotlib.pyplot as plt
198
+ import seaborn as sns
199
+ plt.figure(figsize=(16, 12))
200
+ plt.clf()
201
+ mask = mask[batch, head].cpu().numpy()
202
+ sns.heatmap(mask)
203
+ plt.savefig(name)
204
+
205
+
206
+ @triton.jit
207
+ def triton_dense_fwd_kernel(
208
+ Q, K, V, seqlens, sm_scale,
209
+ Out,
210
+ stride_qz, stride_qh, stride_qm, stride_qk,
211
+ stride_kz, stride_kh, stride_kn, stride_kk,
212
+ stride_vz, stride_vh, stride_vn, stride_vk,
213
+ stride_oz, stride_oh, stride_om, stride_ok,
214
+ Z, H, N_CTX,
215
+ BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr,
216
+ BLOCK_N: tl.constexpr,
217
+ dtype: tl.constexpr,
218
+ ):
219
+ start_m = tl.program_id(0)
220
+ off_hz = tl.program_id(1)
221
+
222
+ seqlen = tl.load(seqlens + off_hz // H)
223
+ if start_m * BLOCK_M >= seqlen:
224
+ return
225
+
226
+ qo_offset = (off_hz // H) * stride_qz + (off_hz % H) * stride_qh
227
+ kv_offset = (off_hz // H) * stride_kz + (off_hz % H) * stride_kh
228
+ Q_block_ptr = tl.make_block_ptr(
229
+ base=Q + qo_offset,
230
+ shape=(N_CTX, BLOCK_DMODEL),
231
+ strides=(stride_qm, stride_qk),
232
+ offsets=(start_m * BLOCK_M, 0),
233
+ block_shape=(BLOCK_M, BLOCK_DMODEL),
234
+ order=(1, 0)
235
+ )
236
+ K_block_ptr = tl.make_block_ptr(
237
+ base=K + kv_offset,
238
+ shape=(BLOCK_DMODEL, N_CTX),
239
+ strides=(stride_kk, stride_kn),
240
+ offsets=(0, 0),
241
+ block_shape=(BLOCK_DMODEL, BLOCK_N),
242
+ order=(0, 1)
243
+ )
244
+ V_block_ptr = tl.make_block_ptr(
245
+ base=V + kv_offset,
246
+ shape=(N_CTX, BLOCK_DMODEL),
247
+ strides=(stride_vn, stride_vk),
248
+ offsets=(0, 0),
249
+ block_shape=(BLOCK_N, BLOCK_DMODEL),
250
+ order=(1, 0)
251
+ )
252
+ # initialize offsets
253
+ offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
254
+ offs_n = tl.arange(0, BLOCK_N)
255
+ # initialize pointer to m and l
256
+ m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
257
+ l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
258
+ acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
259
+ # scale sm_scale by log_2(e) and use
260
+ # 2^x instead of exp in the loop because CSE and LICM
261
+ # don't work as expected with `exp` in the loop
262
+ qk_scale = sm_scale * 1.44269504
263
+ # load q: it will stay in SRAM throughout
264
+ q = tl.load(Q_block_ptr)
265
+ q = (q * qk_scale).to(dtype)
266
+ # loop over k, v and update accumulator
267
+ lo = 0
268
+ hi = (start_m + 1) * BLOCK_M
269
+ m_mask = offs_m[:, None] < seqlen
270
+
271
+ for start_n in range(lo, hi, BLOCK_N):
272
+ n_mask = (start_n + offs_n[None, :]) <= offs_m[:, None]
273
+ # -- load k, v --
274
+ k = tl.load(K_block_ptr)
275
+ v = tl.load(V_block_ptr)
276
+ # -- compute qk --
277
+ qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
278
+ qk = tl.where(m_mask & n_mask, qk, float("-inf"))
279
+ qk += tl.dot(q, k)
280
+ # -- compute scaling constant --
281
+ m_i_new = tl.maximum(m_i, tl.max(qk, 1))
282
+ alpha = tl.math.exp2(m_i - m_i_new)
283
+ p = tl.math.exp2(qk - m_i_new[:, None])
284
+ # -- scale and update acc --
285
+ acc_scale = l_i * 0 + alpha # workaround some compiler bug
286
+ acc *= acc_scale[:, None]
287
+ acc += tl.dot(p.to(dtype), v)
288
+ # -- update m_i and l_i --
289
+ l_i = l_i * alpha + tl.sum(p, 1)
290
+ m_i = m_i_new
291
+ # update pointers
292
+ K_block_ptr = tl.advance(K_block_ptr, (0, BLOCK_N))
293
+ V_block_ptr = tl.advance(V_block_ptr, (BLOCK_N, 0))
294
+ # write back O
295
+ acc = tl.where(m_mask, acc / l_i[:, None], 0.0)
296
+ O_block_ptr = tl.make_block_ptr(
297
+ base=Out + qo_offset,
298
+ shape=(N_CTX, BLOCK_DMODEL),
299
+ strides=(stride_om, stride_ok),
300
+ offsets=(start_m * BLOCK_M, 0),
301
+ block_shape=(BLOCK_M, BLOCK_DMODEL),
302
+ order=(1, 0)
303
+ )
304
+ tl.store(O_block_ptr, acc.to(dtype))
305
+
306
+
307
+ def triton_dense_forward(q, k, v, seqlens, sm_scale, block_size_M=128, block_size_N=64) -> torch.Tensor:
308
+ # shape constraints
309
+ Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
310
+ assert Lq == Lk and Lk == Lv
311
+ assert Lk in {16, 32, 64, 128}
312
+ o = torch.zeros_like(q)
313
+ grid = (triton.cdiv(q.shape[2], block_size_M), q.shape[0] * q.shape[1], 1)
314
+ num_warps = 4 if Lk <= 64 else 8 # 4
315
+ dtype = tl.bfloat16 if q.dtype == torch.bfloat16 else tl.float16
316
+ triton_dense_fwd_kernel[grid](
317
+ q, k, v, seqlens, sm_scale,
318
+ o,
319
+ q.stride(0), q.stride(1), q.stride(2), q.stride(3),
320
+ k.stride(0), k.stride(1), k.stride(2), k.stride(3),
321
+ v.stride(0), v.stride(1), v.stride(2), v.stride(3),
322
+ o.stride(0), o.stride(1), o.stride(2), o.stride(3),
323
+ q.shape[0], q.shape[1], q.shape[2],
324
+ BLOCK_M=block_size_M, BLOCK_N=block_size_N,
325
+ BLOCK_DMODEL=Lk,
326
+ dtype=dtype,
327
+ num_warps=num_warps, num_stages=4,
328
+ )
329
+
330
+ return o
331
+
332
+
333
+ def flash_attn_forward(q, k, v, seqlens, sm_scale, context_size) -> torch.Tensor:
334
+ return flash_attn_varlen_func(
335
+ q,
336
+ k,
337
+ v,
338
+ cu_seqlens_q=seqlens,
339
+ cu_seqlens_k=seqlens,
340
+ max_seqlen_q=context_size,
341
+ max_seqlen_k=context_size,
342
+ dropout_p=0.0,
343
+ softmax_scale=sm_scale,
344
+ causal=True,
345
+ )
346
+
347
+
348
+ def torch_forward(
349
+ query: torch.Tensor,
350
+ key: torch.Tensor,
351
+ value: torch.Tensor,
352
+ mask: torch.Tensor,
353
+ sm_scale: float,
354
+ ) -> torch.Tensor:
355
+ p = torch.einsum('bhmk, bhnk -> bhmn', query, key) * sm_scale
356
+ p = p.where(mask, -torch.inf)
357
+ p_max = p.max(-1, keepdim=True).values
358
+ p_max = torch.where(p_max < 0, 0.0, p_max)
359
+ p_exp = torch.exp(p - p_max)
360
+ s = p_exp / (p_exp.sum(-1, keepdim=True) + 1e-6)
361
+ out = torch.einsum('bhmn, bhnk -> bhmk', s, value)
362
+ return out
363
+
364
+
365
+ def profile(fn, total_flops, tag, warmup=25, rep=100):
366
+ ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
367
+ gflops = total_flops / ms * 1e-9
368
+ print(f'{tag}: {ms:.3f} ms | {gflops:.3f} GFLOP/s')
369
+
370
+
371
+ def test_flash_attention(
372
+ seqlens=None,
373
+ dtype=torch.float16,
374
+ device="cuda",
375
+ torch_test=True,
376
+ batch_size=4,
377
+ num_heads=32,
378
+ context_size=1024,
379
+ head_dim=128,
380
+ top_k=5,
381
+ block_size_M=64,
382
+ block_size_N=64,
383
+ ):
384
+ print('========================================')
385
+ print(f'BATCH={batch_size}, N_CTX={context_size}, N_HEADS={num_heads}, D_HEAD={head_dim}')
386
+ q = torch.randn((batch_size, num_heads, context_size, head_dim), dtype=dtype, device=device)
387
+ k = torch.randn((batch_size, num_heads, context_size, head_dim), dtype=dtype, device=device)
388
+ v = torch.randn((batch_size, num_heads, context_size, head_dim), dtype=dtype, device=device)
389
+ if seqlens is None:
390
+ seqlens = torch.randint(context_size // 2, context_size, (batch_size, ), dtype=torch.int32, device=device)
391
+ else:
392
+ seqlens = torch.tensor(seqlens, dtype=torch.int32, device=device)
393
+ dense_mask_nnz = seqlens.to(torch.float32).square().sum().item() * num_heads / 2
394
+ sm_scale = head_dim ** -0.5
395
+
396
+ causal_mask = make_causal_mask(seqlens, device, context_size)
397
+ if torch_test:
398
+ ref_o_dense = torch_forward(q, k, v, causal_mask, sm_scale)
399
+
400
+ block_index = torch_build_index(q, k, top_k, block_size_M, block_size_N)
401
+ arange_M = torch.arange(block_index.shape[-2], device=device)
402
+ block_index_mask = arange_M[None, None, :, None] * block_size_M >= block_index * block_size_N
403
+ sparse_mask_nnz = block_index_mask.to(torch.float32).sum().item() * block_size_M * block_size_N
404
+ print(f'block mask sparsity: {1 - sparse_mask_nnz / dense_mask_nnz}')
405
+ torch_build_index_fn = lambda: torch_build_index(q, k, top_k, block_size_M, block_size_N)
406
+ profile(torch_build_index_fn, 0., 'torch-index')
407
+
408
+ if torch_test:
409
+ block_mask = make_block_mask(block_index, causal_mask, device, block_size_M, block_size_N)
410
+ ref_o_sparse = torch_forward(q, k, v, block_mask, sm_scale)
411
+
412
+ triton_dense_fn = lambda: triton_dense_forward(q, k, v, seqlens, sm_scale)
413
+ output = triton_dense_fn()
414
+ if torch_test:
415
+ torch.testing.assert_close(output, ref_o_dense, atol=1e-2, rtol=0)
416
+ profile(triton_dense_fn, 2. * head_dim * dense_mask_nnz, 'triton-dense')
417
+
418
+ triton_sparse_fn = lambda: triton_block_sparse_forward(q, k, v, seqlens, block_index, sm_scale, block_size_M, block_size_N)
419
+ output = triton_sparse_fn()
420
+ if torch_test:
421
+ torch.testing.assert_close(output, ref_o_sparse, atol=1e-2, rtol=0)
422
+ profile(triton_sparse_fn, 2. * head_dim * sparse_mask_nnz, 'triton-sparse')
423
+
424
+ q = q.swapaxes(1, 2).contiguous()
425
+ k = k.swapaxes(1, 2).contiguous()
426
+ v = v.swapaxes(1, 2).contiguous()
427
+ q = torch.concatenate([q[i, :seqlen, :, :] for i, seqlen in enumerate(seqlens)])
428
+ k = torch.concatenate([k[i, :seqlen, :, :] for i, seqlen in enumerate(seqlens)])
429
+ v = torch.concatenate([v[i, :seqlen, :, :] for i, seqlen in enumerate(seqlens)])
430
+ seqlens = torch.nn.functional.pad(torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0))
431
+
432
+ flash_fn = lambda: flash_attn_forward(q, k, v, seqlens, sm_scale, context_size)
433
+ output = flash_fn()
434
+ output = torch.stack([
435
+ torch.nn.functional.pad(
436
+ output[seqlens[i]:seqlens[i + 1], :, :],
437
+ (0, 0, 0, 0, 0, context_size + seqlens[i] - seqlens[i + 1])
438
+ )
439
+ for i in range(batch_size)
440
+ ]).swapaxes(1, 2).contiguous()
441
+ if torch_test:
442
+ torch.testing.assert_close(output, ref_o_dense, atol=1e-2, rtol=0)
443
+ profile(flash_fn, 2. * head_dim * dense_mask_nnz, 'flash-dense')
444
+ print('========================================\n')
445
+
446
+
447
+ def block_sparse_flash_attention_forward(
448
+ query: torch.Tensor, # [BATCH, N_HEADS, N_CTX, D_HEAD]
449
+ key: torch.Tensor, # [BATCH, N_HEADS, N_CTX, D_HEAD]
450
+ value: torch.Tensor, # [BATCH, N_HEADS, N_CTX, D_HEAD]
451
+ top_k: int,
452
+ block_size_M: int = 64,
453
+ block_size_N: int = 64,
454
+ ):
455
+ batch_size, num_heads, context_size, head_dim = query.shape
456
+ pad = block_size_M - (query.shape[2] & (block_size_M - 1))
457
+ query = torch.nn.functional.pad(query, [0, 0, 0, pad, 0, 0, 0, 0])
458
+ key = torch.nn.functional.pad(key, [0, 0, 0, pad, 0, 0, 0, 0])
459
+ value = torch.nn.functional.pad(value, [0, 0, 0, pad, 0, 0, 0, 0])
460
+ seqlens = torch.tensor([context_size], dtype=torch.int32, device=query.device)
461
+ sm_scale = head_dim ** -0.5
462
+ block_index = torch_build_index(query, key, top_k, block_size_M, block_size_N)
463
+ out = triton_block_sparse_forward(query, key, value, seqlens, block_index, sm_scale, block_size_M, block_size_N)
464
+ return out[..., :context_size, :]
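
One caveat on the entry point above: the pad computation uses a bit-mask, `block_size_M - (len & (block_size_M - 1))`, so it assumes block_size_M is a power of two; it also pads a full extra block when the length is already aligned, which the final slice simply trims off. For example:

block_size_M = 64
for context_size in (1000, 1024):
    pad = block_size_M - (context_size & (block_size_M - 1))
    print(context_size, pad, context_size + pad)  # 1000 -> 24 -> 1024; 1024 -> 64 -> 1088
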
minference/ops/pit_sparse_flash_attention.py ADDED
@@ -0,0 +1,740 @@
1
+ import numpy as np
2
+ import pycuda.autoprimaryctx
3
+ import torch
4
+ import triton
5
+ import triton.language as tl
6
+ from flash_attn import flash_attn_varlen_func
7
+ from pycuda.compiler import SourceModule
8
+
9
+
10
+ @triton.autotune(
11
+ configs=[
12
+ triton.Config({}, num_stages=1, num_warps=4),
13
+ triton.Config({}, num_stages=1, num_warps=8),
14
+ triton.Config({}, num_stages=2, num_warps=4),
15
+ triton.Config({}, num_stages=2, num_warps=8),
16
+ triton.Config({}, num_stages=3, num_warps=4),
17
+ triton.Config({}, num_stages=3, num_warps=8),
18
+ triton.Config({}, num_stages=4, num_warps=4),
19
+ triton.Config({}, num_stages=4, num_warps=8),
20
+ triton.Config({}, num_stages=5, num_warps=4),
21
+ triton.Config({}, num_stages=5, num_warps=8),
22
+ ],
23
+ key=['N_CTX'],
24
+ )
25
+ @triton.jit
26
+ def triton_sparse_fwd_kernel(
27
+ Q, K, V, seqlens, sm_scale,
28
+ col_count, col_index,
29
+ Out,
30
+ stride_qz, stride_qh, stride_qm, stride_qk,
31
+ stride_kz, stride_kh, stride_kn, stride_kk,
32
+ stride_vz, stride_vh, stride_vn, stride_vk,
33
+ stride_oz, stride_oh, stride_om, stride_ok,
34
+ Z, H, N_CTX,
35
+ NUM_ROWS, MAX_COLS_PRE_ROW,
36
+ BLOCK_M: tl.constexpr,
37
+ BLOCK_N: tl.constexpr,
38
+ BLOCK_DMODEL: tl.constexpr,
39
+ dtype: tl.constexpr,
40
+ ):
41
+ start_m = tl.program_id(0)
42
+ off_hz = tl.program_id(1)
43
+
44
+ seqlen = tl.load(seqlens + off_hz // H)
45
+ if start_m * BLOCK_M >= seqlen:
46
+ return
47
+
48
+ # initialize offsets
49
+ offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
50
+ offs_n = tl.arange(0, BLOCK_N)
51
+ offs_d = tl.arange(0, BLOCK_DMODEL)
52
+
53
+ qo_offset = (off_hz // H) * stride_qz + (off_hz % H) * stride_qh
54
+ kv_offset = (off_hz // H) * stride_kz + (off_hz % H) * stride_kh
55
+
56
+ q_ptrs = Q + qo_offset + offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qk
57
+ k_ptrs = K + kv_offset + offs_d[:, None] * stride_kk
58
+ v_ptrs = V + kv_offset + offs_d[None, :] * stride_vk
59
+ o_ptrs = Out + qo_offset + offs_m[:, None] * stride_om + offs_d[None, :] * stride_ok
60
+
61
+ num_cols = tl.load(col_count + off_hz * NUM_ROWS + start_m)
62
+ cols_ptr = col_index + (off_hz * NUM_ROWS + start_m) * MAX_COLS_PRE_ROW
63
+
64
+ # initialize pointer to m and l
65
+ m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
66
+ l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
67
+ acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
68
+ # scale sm_scale by log_2(e) and use
69
+ # 2^x instead of exp in the loop because CSE and LICM
70
+ # don't work as expected with `exp` in the loop
71
+ qk_scale = sm_scale * 1.44269504
72
+ # load q: it will stay in SRAM throughout
73
+ q = tl.load(q_ptrs)
74
+ q = (q * qk_scale).to(dtype)
75
+
76
+ # loop over k, v and update accumulator
77
+ m_mask = offs_m[:, None] < seqlen
78
+ split = tl.maximum(num_cols - BLOCK_N, 0) & ~(BLOCK_N - 1)
79
+
80
+ for start_n in range(0, split, BLOCK_N):
81
+ cols = tl.load(cols_ptr + start_n + offs_n)
82
+ # -- load k, v --
83
+ k = tl.load(k_ptrs + cols[None, :] * stride_kn)
84
+ v = tl.load(v_ptrs + cols[:, None] * stride_vn)
85
+ # -- compute qk --
86
+ qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
87
+ qk = tl.where(m_mask, qk, float("-inf"))
88
+ qk += tl.dot(q, k)
89
+ # -- compute scaling constant --
90
+ m_i_new = tl.maximum(m_i, tl.max(qk, 1))
91
+ alpha = tl.math.exp2(m_i - m_i_new)
92
+ p = tl.math.exp2(qk - m_i_new[:, None])
93
+ # -- scale and update acc --
94
+ acc_scale = l_i * 0 + alpha # workaround some compiler bug
95
+ acc *= acc_scale[:, None]
96
+ acc += tl.dot(p.to(dtype), v)
97
+ # -- update m_i and l_i --
98
+ l_i = l_i * alpha + tl.sum(p, 1)
99
+ m_i = m_i_new
100
+
101
+ for start_n in range(split, num_cols, BLOCK_N):
102
+ n_mask = start_n + offs_n < num_cols
103
+ cols = tl.load(cols_ptr + start_n + offs_n, mask=n_mask, other=N_CTX - 1)
104
+ causal_mask = cols[None, :] <= offs_m[:, None]
105
+ # -- load k, v --
106
+ k = tl.load(k_ptrs + cols[None, :] * stride_kn)
107
+ v = tl.load(v_ptrs + cols[:, None] * stride_vn)
108
+ # -- compute qk --
109
+ qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
110
+ qk = tl.where(m_mask & causal_mask, qk, float("-inf"))
111
+ qk += tl.dot(q, k)
112
+ # -- compute scaling constant --
113
+ m_i_new = tl.maximum(m_i, tl.max(qk, 1))
114
+ alpha = tl.math.exp2(m_i - m_i_new)
115
+ p = tl.math.exp2(qk - m_i_new[:, None])
116
+ # -- scale and update acc --
117
+ acc_scale = l_i * 0 + alpha # workaround some compiler bug
118
+ acc *= acc_scale[:, None]
119
+ acc += tl.dot(p.to(dtype), v)
120
+ # -- update m_i and l_i --
121
+ l_i = l_i * alpha + tl.sum(p, 1)
122
+ m_i = m_i_new
123
+
124
+ # write back O
125
+ acc = tl.where(m_mask, acc / l_i[:, None], 0.0)
126
+ tl.store(o_ptrs, acc.to(dtype), mask=m_mask)
127
+
128
+
129
+ def triton_sparse_forward(
130
+ q, # [BATCH, N_HEADS, N_CTX, D_HEAD]
131
+ k, # [BATCH, N_HEADS, N_CTX, D_HEAD]
132
+ v, # [BATCH, N_HEADS, N_CTX, D_HEAD]
133
+ seqlens, # [BATCH, ]
134
+ col_count, # [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
135
+ col_index, # [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), MAX_COLS_PRE_ROW]
136
+ sm_scale,
137
+ block_size_M=64,
138
+ block_size_N=64,
139
+ ) -> torch.Tensor:
140
+ # shape constraints
141
+ Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
142
+ assert Lq == Lk and Lk == Lv
143
+ assert Lk in {16, 32, 64, 128}
144
+ o = torch.zeros_like(q)
145
+ grid = (triton.cdiv(q.shape[2], block_size_M), q.shape[0] * q.shape[1], 1)
146
+ num_warps = 4 if (Lk <= 64 or block_size_M <= 64) else 8  # currently unused: the num_warps launch argument is commented out below
147
+ dtype = tl.bfloat16 if q.dtype == torch.bfloat16 else tl.float16
148
+ triton_sparse_fwd_kernel[grid](
149
+ q, k, v, seqlens, sm_scale,
150
+ col_count, col_index,
151
+ o,
152
+ q.stride(0), q.stride(1), q.stride(2), q.stride(3),
153
+ k.stride(0), k.stride(1), k.stride(2), k.stride(3),
154
+ v.stride(0), v.stride(1), v.stride(2), v.stride(3),
155
+ o.stride(0), o.stride(1), o.stride(2), o.stride(3),
156
+ q.shape[0], q.shape[1], q.shape[2],
157
+ col_index.shape[-2], col_index.shape[-1],
158
+ BLOCK_M=block_size_M, BLOCK_N=block_size_N,
159
+ BLOCK_DMODEL=Lk,
160
+ dtype=dtype,
161
+ # num_warps=num_warps, num_stages=4,
162
+ )
163
+
164
+ return o
165
+
166
+
167
+ def torch_build_index(seqlens, vertical_indexes, slash_indexes, block_size_M=64):
168
+ max_cols_per_row = (seqlens.max().item() + 3) & (-4)
169
+ batch_size, num_heads, NNZ_S = slash_indexes.shape
170
+ NNZ_V = vertical_indexes.shape[-1]
171
+ num_rows = triton.cdiv(max_cols_per_row, block_size_M)
173
+ col_count = torch.zeros((batch_size, num_heads, num_rows), dtype=torch.int32)
174
+ col_index = torch.zeros((batch_size, num_heads, num_rows, max_cols_per_row), dtype=torch.int32)
175
+ for b in range(batch_size):
176
+ seqlen = seqlens[b]
177
+ for h in range(num_heads):
178
+ for m, start_m in enumerate(range(0, seqlen, block_size_M)):
179
+ end_m = start_m + block_size_M
180
+ tmp_col_count = 0
181
+ cursor, s, v = -1, 0, 0
182
+ v_idx = vertical_indexes[b, h, v].item()
183
+ while s < NNZ_S and slash_indexes[b, h, s] >= end_m:
184
+ s += 1
185
+ if s < NNZ_S:
186
+ s_idx = end_m - slash_indexes[b, h, s].item()
187
+ s_range = min(s_idx, block_size_M)
188
+ else:
189
+ s_idx = seqlen
190
+ s_range = 0
191
+ while s_idx <= end_m and v_idx < end_m:
192
+ if v_idx < s_idx:
193
+ if v_idx < s_idx - s_range:
194
+ col_index[b, h, m, tmp_col_count] = v_idx
195
+ tmp_col_count += 1
196
+ v += 1
197
+ if v < NNZ_V:
198
+ v_idx = vertical_indexes[b, h, v].item()
199
+ else:
200
+ break
201
+ else:
202
+ for idx in range(max(cursor, s_idx - s_range), min(s_idx, seqlen)):
203
+ col_index[b, h, m, tmp_col_count] = idx
204
+ tmp_col_count += 1
205
+ cursor = s_idx
206
+ s += 1
207
+ if s < NNZ_S:
208
+ s_idx = end_m - slash_indexes[b, h, s].item()
209
+ s_range = min(s_idx, block_size_M)
210
+ else:
211
+ break
212
+ while s_idx <= end_m and s < NNZ_S:
213
+ for idx in range(max(cursor, s_idx - s_range), min(s_idx, seqlen)):
214
+ col_index[b, h, m, tmp_col_count] = idx
215
+ tmp_col_count += 1
216
+ cursor = s_idx
217
+ s += 1
218
+ if s < NNZ_S:
219
+ s_idx = end_m - slash_indexes[b, h, s].item()
220
+ s_range = min(s_idx, block_size_M)
221
+ else:
222
+ break
223
+ while v_idx < end_m and v < NNZ_V:
224
+ if v_idx < s_idx - s_range:
225
+ col_index[b, h, m, tmp_col_count] = v_idx
226
+ tmp_col_count += 1
227
+ v += 1
228
+ if v < NNZ_V:
229
+ v_idx = vertical_indexes[b, h, v].item()
230
+ else:
231
+ break
232
+ col_count[b, h, m] = tmp_col_count
233
+ return col_count.to(seqlens.device), col_index.to(seqlens.device)
234
+
235
+
236
+
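
torch_build_index above is the CPU reference for the index builder; a toy run (illustrative only, shapes as in the docstring comments) shows how vertical and slash indexes turn into per-block-row column lists:

    # One batch, one head, seqlen 8, block rows of 4 -> 2 block rows.
    # A slash index d selects the diagonal q - k == d; a vertical index c selects column c.
    seqlens = torch.tensor([8], dtype=torch.int32)
    vertical = torch.tensor([[[1]]], dtype=torch.int32)  # [BATCH, N_HEADS, NNZ_V]
    slash = torch.tensor([[[0]]], dtype=torch.int32)     # [BATCH, N_HEADS, NNZ_S]
    col_count, col_index = torch_build_index(seqlens, vertical, slash, block_size_M=4)
    # Row 0 should keep columns [0, 1, 2, 3] (the main diagonal expanded to the block);
    # row 1 should keep [1, 4, 5, 6, 7] (vertical column 1 plus the diagonal block).
    print(col_count[0, 0], col_index[0, 0, :, :5])
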
237
+ PYCUDA_BUILD_INDEX_KERNEL_CODE = '''\
238
+ __device__ int min(int x, int y) {
239
+ return x < y ? x : y;
240
+ }
241
+
242
+ __device__ int max(int x, int y) {
243
+ return x > y ? x : y;
244
+ }
245
+
246
+ __device__ void save_list(int* output, int loop_start, int loop_end, int& offset) {
247
+ if (loop_start + 4 >= loop_end) {
248
+ for (int idx = loop_start; idx < loop_end; idx++, offset++) {
249
+ output[offset] = idx;
250
+ }
251
+ return;
252
+ }
253
+ int4 tmp_int4;
254
+ int int4_start = ((offset + 3) & (-4)) - offset + loop_start;
255
+ int int4_end = ((offset + loop_end - loop_start) & (-4)) - offset + loop_start;
256
+ for (int idx = loop_start; idx < int4_start; idx++, offset++) {
257
+ output[offset] = idx;
258
+ }
259
+ for (int idx = int4_start; idx < int4_end; idx += 4, offset += 4) {
260
+ tmp_int4.x = idx + 0;
261
+ tmp_int4.y = idx + 1;
262
+ tmp_int4.z = idx + 2;
263
+ tmp_int4.w = idx + 3;
264
+ (reinterpret_cast<int4*>(&output[offset]))[0] = tmp_int4;
265
+ }
266
+ for (int idx = int4_end; idx < loop_end; idx++, offset++) {
267
+ output[offset] = idx;
268
+ }
269
+ }
270
+
271
+ __global__ void PYCUDA_BUILD_INDEX_KERNEL(
272
+ const int* seqlens, // [BATCH, ]
273
+ const int* vertical_indexes, // [BATCH, N_HEADS, NNZ_V]
274
+ const int* slash_indexes, // [BATCH, N_HEADS, NNZ_S]
275
+ int* col_count, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
276
+ int* col_index, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), N_CTX]
277
+ int N_HEADS,
278
+ int N_CTX,
279
+ int BLOCK_SIZE_M,
280
+ int N_ROWS,
281
+ int NNZ_V,
282
+ int NNZ_S
283
+ ) {
284
+ const int batch_idx = blockIdx.y;
285
+ const int head_idx = blockIdx.x;
286
+ const int group_idx = blockIdx.z;
287
+
288
+ int seqlen = seqlens[batch_idx];
289
+ int block_idx_m = group_idx * blockDim.x + threadIdx.x;
290
+ int start_m = block_idx_m * BLOCK_SIZE_M;
291
+ if (start_m >= seqlen) {
292
+ return;
293
+ }
294
+ int end_m = start_m + BLOCK_SIZE_M;
295
+ vertical_indexes += (batch_idx * N_HEADS + head_idx) * NNZ_V;
296
+ slash_indexes += (batch_idx * N_HEADS + head_idx) * NNZ_S;
297
+ int row_offset = (batch_idx * N_HEADS + head_idx) * N_ROWS + block_idx_m;
298
+ col_count += row_offset;
299
+ col_index += row_offset * N_CTX;
300
+
301
+ int tmp_col_count = 0, cursor = -1, s = 0, v = 0;
302
+ int v_idx = vertical_indexes[v];
303
+ /*
304
+ int left = 0, right = NNZ_S - 1;
305
+ int tmp_s_idx = 0, target = end_m - 1;
306
+ s = (left + right) >> 1;
307
+ while (left + 1 < right) {
308
+ tmp_s_idx = slash_indexes[s];
309
+ if (tmp_s_idx > target) {
310
+ left = s;
311
+ } else if (tmp_s_idx < target) {
312
+ right = s;
313
+ } else {
314
+ break;
315
+ }
316
+ s = (left + right) >> 1;
317
+ }
318
+ */
319
+ while (s < NNZ_S && slash_indexes[s] >= end_m) s++;
320
+
321
+ int s_idx = (s < NNZ_S) ? (end_m - slash_indexes[s]) : seqlen;
322
+ int s_range = (s < NNZ_S) ? min(s_idx, BLOCK_SIZE_M) : 0;
323
+
324
+ while (s_idx <= end_m && v_idx < end_m) {
325
+ if (v_idx < s_idx) {
326
+ if (v_idx < s_idx - s_range) {
327
+ col_index[tmp_col_count] = v_idx;
328
+ tmp_col_count++;
329
+ }
330
+ v++;
331
+ if (v < NNZ_V) {
332
+ v_idx = vertical_indexes[v];
333
+ } else {
334
+ break;
335
+ }
336
+ } else {
337
+ save_list(col_index, max(cursor, s_idx - s_range), min(s_idx, seqlen), tmp_col_count);
338
+ cursor = s_idx;
339
+ s++;
340
+ if (s < NNZ_S) {
341
+ s_idx = end_m - slash_indexes[s];
342
+ s_range = min(s_idx, BLOCK_SIZE_M);
343
+ } else {
344
+ break;
345
+ }
346
+ }
347
+ }
348
+ while (s_idx <= end_m && s < NNZ_S) {
349
+ save_list(col_index, max(cursor, s_idx - s_range), min(s_idx, seqlen), tmp_col_count);
350
+ cursor = s_idx;
351
+ s++;
352
+ if (s < NNZ_S) {
353
+ s_idx = end_m - slash_indexes[s];
354
+ s_range = min(s_idx, BLOCK_SIZE_M);
355
+ } else {
356
+ break;
357
+ }
358
+ }
359
+ while (v_idx < end_m && v < NNZ_V) {
360
+ if (v_idx < s_idx - s_range) {
361
+ col_index[tmp_col_count] = v_idx;
362
+ tmp_col_count++;
363
+ }
364
+ v++;
365
+ if (v < NNZ_V) {
366
+ v_idx = vertical_indexes[v];
367
+ } else {
368
+ break;
369
+ }
370
+ }
371
+ col_count[0] = tmp_col_count;
372
+ }
373
+ '''
374
+ PYCUDA_BUILD_INDEX_KERNEL = SourceModule(
375
+ PYCUDA_BUILD_INDEX_KERNEL_CODE,
376
+ options=['-std=c++14', '-O3'],
377
+ ).get_function('PYCUDA_BUILD_INDEX_KERNEL')
378
+
379
+
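
save_list in the CUDA source above vectorizes the index writes: scalar stores until the output offset is 4-aligned, 16-byte int4 stores through the aligned middle, and scalar stores for the tail. The bound arithmetic can be checked in plain Python (illustrative only):

    def int4_bounds(offset, loop_start, loop_end):
        # (offset + 3) & (-4) rounds offset up to the next multiple of 4,
        # which is where the vectorized int4 stores may begin.
        int4_start = ((offset + 3) & (-4)) - offset + loop_start
        int4_end = ((offset + loop_end - loop_start) & (-4)) - offset + loop_start
        return int4_start, int4_end

    # Writing indices [10, 22) starting at output offset 6:
    # scalar [10, 12) fills offsets 6..7; int4 [12, 20) stores at offsets 8 and 12;
    # scalar [20, 22) fills offsets 16..17.
    assert int4_bounds(6, 10, 22) == (12, 20)
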
380
+ def pycuda_build_index(seqlens, vertical_indexes, slash_indexes, block_size_M=64):
381
+ max_cols_per_row = (seqlens.max().item() + 3) & (-4)
382
+ batch_size, num_heads, NNZ_S = slash_indexes.shape
383
+ NNZ_V = vertical_indexes.shape[-1]
384
+ num_rows = triton.cdiv(max_cols_per_row, block_size_M)
385
+ max_cols_per_row = max_cols_per_row
386
+ col_count = torch.zeros((batch_size, num_heads, num_rows), dtype=torch.int32, device=seqlens.device)
387
+ col_index = torch.zeros((batch_size, num_heads, num_rows, max_cols_per_row), dtype=torch.int32, device=seqlens.device)
388
+ num_threads = 64
389
+ PYCUDA_BUILD_INDEX_KERNEL(
390
+ seqlens, vertical_indexes, slash_indexes,
391
+ col_count, col_index,
392
+ np.int32(num_heads), np.int32(max_cols_per_row), np.int32(block_size_M), np.int32(num_rows),
393
+ np.int32(NNZ_V), np.int32(NNZ_S),
394
+ # grid=(triton.cdiv(num_rows, num_threads), N_HEADS, BATCH),
395
+ grid=(num_heads, batch_size, triton.cdiv(num_rows, num_threads)),
396
+ block=(num_threads, 1, 1),
397
+ )
398
+ return col_count, col_index
399
+
400
+
401
+ def make_causal_mask(seqlens, device, context_size):
402
+ batch_size = seqlens.shape[0]
403
+ arange = torch.arange(context_size, dtype=torch.int32, device=device)
404
+ causal_mask = arange[None, None, :, None] >= arange[None, None, None, :]
405
+ causal_mask = causal_mask.repeat((batch_size, 1, 1, 1))
406
+ for b, seqlen in enumerate(seqlens):
407
+ causal_mask[b, :, seqlen:, :] = False
408
+ causal_mask[b, :, :, seqlen:] = False
409
+ return causal_mask
410
+
411
+
412
+ def make_finegrained_mask(vertical_indexes, slash_indexes, causal_mask, device):
413
+ batch_size, num_heads, _ = vertical_indexes.shape
414
+ context_size = causal_mask.shape[-1]
415
+ arange = torch.arange(context_size, dtype=torch.int32, device=device)
416
+ sparse_mask = torch.zeros((batch_size, num_heads, context_size, context_size), dtype=torch.bool, device=device)
417
+ for b in range(batch_size):
418
+ for h in range(num_heads):
419
+ for vertical_index in vertical_indexes[b, h]:
420
+ sparse_mask[b, h, :, vertical_index] = True
421
+ for slash_index in slash_indexes[b, h]:
422
+ sparse_mask[b, h].logical_or_(arange[:, None] - arange[None, :] == slash_index)
423
+ sparse_mask.logical_and_(causal_mask)
424
+ return sparse_mask
425
+
426
+
427
+ def make_block_mask(col_count, col_index, seqlens, causal_mask, device, block_size_M=64):
428
+ batch_size, num_heads, _ = col_count.shape
429
+ context_size = causal_mask.shape[-1]
430
+ block_mask = torch.zeros((batch_size, num_heads, context_size, context_size), dtype=torch.bool, device=device)
431
+ for b in range(batch_size):
432
+ for h in range(num_heads):
433
+ for m, start_m in enumerate(range(0, seqlens[b], block_size_M)):
434
+ end_m = start_m + block_size_M
435
+ for c in range(col_count[b, h, m]):
436
+ block_mask[b, h, start_m:end_m, col_index[b, h, m, c]] = True
437
+ block_mask.logical_and_(causal_mask)
438
+ return block_mask
439
+
440
+
441
+ def plot_mask(mask, name, batch=0, head=0):
442
+ import matplotlib.pyplot as plt
443
+ import seaborn as sns
444
+ plt.figure(figsize=(16, 12))
445
+ plt.clf()
446
+ mask = mask[batch, head].cpu().numpy()
447
+ sns.heatmap(mask)
448
+ plt.savefig(name)
449
+
450
+
451
+ @triton.jit
452
+ def triton_dense_fwd_kernel(
453
+ Q, K, V, seqlens, sm_scale,
454
+ Out,
455
+ stride_qz, stride_qh, stride_qm, stride_qk,
456
+ stride_kz, stride_kh, stride_kn, stride_kk,
457
+ stride_vz, stride_vh, stride_vn, stride_vk,
458
+ stride_oz, stride_oh, stride_om, stride_ok,
459
+ Z, H, N_CTX,
460
+ BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr,
461
+ BLOCK_N: tl.constexpr,
462
+ dtype: tl.constexpr,
463
+ ):
464
+ start_m = tl.program_id(0)
465
+ off_hz = tl.program_id(1)
466
+
467
+ seqlen = tl.load(seqlens + off_hz // H)
468
+ if start_m * BLOCK_M >= seqlen:
469
+ return
470
+
471
+ qo_offset = (off_hz // H) * stride_qz + (off_hz % H) * stride_qh
472
+ kv_offset = (off_hz // H) * stride_kz + (off_hz % H) * stride_kh
473
+ Q_block_ptr = tl.make_block_ptr(
474
+ base=Q + qo_offset,
475
+ shape=(N_CTX, BLOCK_DMODEL),
476
+ strides=(stride_qm, stride_qk),
477
+ offsets=(start_m * BLOCK_M, 0),
478
+ block_shape=(BLOCK_M, BLOCK_DMODEL),
479
+ order=(1, 0)
480
+ )
481
+ K_block_ptr = tl.make_block_ptr(
482
+ base=K + kv_offset,
483
+ shape=(BLOCK_DMODEL, N_CTX),
484
+ strides=(stride_kk, stride_kn),
485
+ offsets=(0, 0),
486
+ block_shape=(BLOCK_DMODEL, BLOCK_N),
487
+ order=(0, 1)
488
+ )
489
+ V_block_ptr = tl.make_block_ptr(
490
+ base=V + kv_offset,
491
+ shape=(N_CTX, BLOCK_DMODEL),
492
+ strides=(stride_vn, stride_vk),
493
+ offsets=(0, 0),
494
+ block_shape=(BLOCK_N, BLOCK_DMODEL),
495
+ order=(1, 0)
496
+ )
497
+ # initialize offsets
498
+ offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
499
+ offs_n = tl.arange(0, BLOCK_N)
500
+ # initialize pointer to m and l
501
+ m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
502
+ l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
503
+ acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
504
+ # scale sm_scale by log_2(e) and use
505
+ # 2^x instead of exp in the loop because CSE and LICM
506
+ # don't work as expected with `exp` in the loop
507
+ qk_scale = sm_scale * 1.44269504
508
+ # load q: it will stay in SRAM throughout
509
+ q = tl.load(Q_block_ptr)
510
+ q = (q * qk_scale).to(dtype)
511
+ # loop over k, v and update accumulator
512
+ lo = 0
513
+ hi = (start_m + 1) * BLOCK_M
514
+ m_mask = offs_m[:, None] < seqlen
515
+
516
+ for start_n in range(lo, hi, BLOCK_N):
517
+ n_mask = (start_n + offs_n[None, :]) <= offs_m[:, None]
518
+ # -- load k, v --
519
+ k = tl.load(K_block_ptr)
520
+ v = tl.load(V_block_ptr)
521
+ # -- compute qk --
522
+ qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
523
+ qk = tl.where(m_mask & n_mask, qk, float("-inf"))
524
+ qk += tl.dot(q, k)
525
+ # -- compute scaling constant --
526
+ m_i_new = tl.maximum(m_i, tl.max(qk, 1))
527
+ alpha = tl.math.exp2(m_i - m_i_new)
528
+ p = tl.math.exp2(qk - m_i_new[:, None])
529
+ # -- scale and update acc --
530
+ acc_scale = l_i * 0 + alpha # workaround some compiler bug
531
+ acc *= acc_scale[:, None]
532
+ acc += tl.dot(p.to(dtype), v)
533
+ # -- update m_i and l_i --
534
+ l_i = l_i * alpha + tl.sum(p, 1)
535
+ m_i = m_i_new
536
+ # update pointers
537
+ K_block_ptr = tl.advance(K_block_ptr, (0, BLOCK_N))
538
+ V_block_ptr = tl.advance(V_block_ptr, (BLOCK_N, 0))
539
+ # write back O
540
+ acc = tl.where(m_mask, acc / l_i[:, None], 0.0)
541
+ O_block_ptr = tl.make_block_ptr(
542
+ base=Out + qo_offset,
543
+ shape=(N_CTX, BLOCK_DMODEL),
544
+ strides=(stride_om, stride_ok),
545
+ offsets=(start_m * BLOCK_M, 0),
546
+ block_shape=(BLOCK_M, BLOCK_DMODEL),
547
+ order=(1, 0)
548
+ )
549
+ tl.store(O_block_ptr, acc.to(dtype), mask=m_mask)
550
+
551
+
552
+ def triton_dense_forward(q, k, v, seqlens, sm_scale, block_size_M=128, block_size_N=64) -> torch.Tensor:
553
+ # shape constraints
554
+ Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
555
+ assert Lq == Lk and Lk == Lv
556
+ assert Lk in {16, 32, 64, 128}
557
+ o = torch.zeros_like(q)
558
+ grid = (triton.cdiv(q.shape[2], block_size_M), q.shape[0] * q.shape[1], 1)
559
+ num_warps = 4 if Lk <= 64 else 8 # 4
560
+ dtype = tl.bfloat16 if q.dtype == torch.bfloat16 else tl.float16
561
+ triton_dense_fwd_kernel[grid](
562
+ q, k, v, seqlens, sm_scale,
563
+ o,
564
+ q.stride(0), q.stride(1), q.stride(2), q.stride(3),
565
+ k.stride(0), k.stride(1), k.stride(2), k.stride(3),
566
+ v.stride(0), v.stride(1), v.stride(2), v.stride(3),
567
+ o.stride(0), o.stride(1), o.stride(2), o.stride(3),
568
+ q.shape[0], q.shape[1], q.shape[2],
569
+ BLOCK_M=block_size_M, BLOCK_N=block_size_N,
570
+ BLOCK_DMODEL=Lk,
571
+ dtype=dtype,
572
+ num_warps=num_warps, num_stages=4,
573
+ )
574
+
575
+ return o
576
+
577
+
578
+ def flash_attn_forward(q, k, v, seqlens, sm_scale, context_size) -> torch.Tensor:
579
+ return flash_attn_varlen_func(
580
+ q,
581
+ k,
582
+ v,
583
+ cu_seqlens_q=seqlens,
584
+ cu_seqlens_k=seqlens,
585
+ max_seqlen_q=context_size,
586
+ max_seqlen_k=context_size,
587
+ dropout_p=0.0,
588
+ softmax_scale=sm_scale,
589
+ causal=True,
590
+ )
591
+
592
+
593
+ def torch_forward(
594
+ query: torch.Tensor,
595
+ key: torch.Tensor,
596
+ value: torch.Tensor,
597
+ mask: torch.Tensor,
598
+ sm_scale: float,
599
+ ) -> torch.Tensor:
600
+ p = torch.einsum('bhmk, bhnk -> bhmn', query, key) * sm_scale
601
+ p = p.where(mask, -torch.inf)
602
+ p_max = p.max(-1, keepdim=True).values
603
+ p_max = torch.where(p_max < 0, 0.0, p_max)
604
+ p_exp = torch.exp(p - p_max)
605
+ s = p_exp / (p_exp.sum(-1, keepdim=True) + 1e-6)
606
+ out = torch.einsum('bhmn, bhnk -> bhmk', s, value)
607
+ return out
608
+
609
+
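
torch_forward clamps the row max at zero and pads the denominator so that fully masked rows (all scores -inf) come out as zeros rather than NaN, matching what the kernels store for out-of-range rows. A short check (illustrative only):

    row = torch.full((4,), -torch.inf)
    p_max = row.max().clamp_min(0.0)        # 0.0 instead of -inf
    p_exp = torch.exp(row - p_max)          # all zeros; avoids nan from inf - inf
    print(p_exp / (p_exp.sum() + 1e-6))     # tensor([0., 0., 0., 0.])
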
610
+ def profile(fn, total_flops, tag, warmup=25, rep=100):
611
+ ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
612
+ gflops = total_flops / ms * 1e-9
613
+ print(f'{tag}: {ms:.3f} ms | {gflops:.3f} GFLOP/s')
614
+
615
+
616
+ def test_flash_attention(
617
+ seqlens=None,
618
+ vertical_indexes=None,
619
+ slash_indexes=None,
620
+ dtype=torch.float16,
621
+ device="cuda",
622
+ torch_test=True,
623
+ batch_size=4,
624
+ num_heads=32,
625
+ context_size=1024,
626
+ head_dim=128,
627
+ sparsity=0.995,
628
+ block_size_M=64,
629
+ block_size_N=64,
630
+ ):
631
+ print('========================================')
632
+ print(f'BATCH={batch_size}, N_CTX={context_size}, N_HEADS={num_heads}, D_HEAD={head_dim}')
633
+ q = torch.randn((batch_size, num_heads, context_size, head_dim), dtype=dtype, device=device)
634
+ k = torch.randn((batch_size, num_heads, context_size, head_dim), dtype=dtype, device=device)
635
+ v = torch.randn((batch_size, num_heads, context_size, head_dim), dtype=dtype, device=device)
636
+ if seqlens is None:
637
+ seqlens = torch.randint(context_size // 2, context_size, (batch_size, ), dtype=torch.int32, device=device)
638
+ else:
639
+ seqlens = torch.tensor(seqlens, dtype=torch.int32, device=device)
640
+ dense_mask_nnz = seqlens.to(torch.float32).square().sum().item() * num_heads / 2
641
+ sm_scale = head_dim ** -0.5
642
+
643
+ causal_mask = make_causal_mask(seqlens, device, context_size)
644
+ if torch_test:
645
+ ref_o_dense = torch_forward(q, k, v, causal_mask, sm_scale)
646
+
647
+ if vertical_indexes is None or slash_indexes is None:
648
+ nnz = int((1 - sparsity) * context_size)
649
+ vertical_indexes = torch.stack([
650
+ torch.stack([
651
+ torch.randperm(seqlen, dtype=torch.int32, device=device)[:nnz].sort(descending=False)[0]
652
+ for _ in range(num_heads)
653
+ ])
654
+ for seqlen in seqlens
655
+ ])
656
+ slash_indexes = torch.concatenate([
657
+ torch.stack([
658
+ torch.stack([
659
+ torch.randperm(seqlen - 1, dtype=torch.int32, device=device)[:nnz].sort(descending=True)[0] + 1
660
+ for _ in range(num_heads)
661
+ ])
662
+ for seqlen in seqlens
663
+ ]),
664
+ torch.zeros((batch_size, num_heads, 1), dtype=torch.int32, device=device)
665
+ ], dim=-1)
666
+ col_count, col_index = pycuda_build_index(seqlens, vertical_indexes, slash_indexes, block_size_M)
667
+ if torch_test:
668
+ col_count_ref, col_index_ref = torch_build_index(seqlens, vertical_indexes, slash_indexes, block_size_M)
669
+ # import ipdb; ipdb.set_trace()
670
+ torch.testing.assert_close(col_count_ref, col_count)
671
+ torch.testing.assert_close(col_index_ref, col_index)
672
+ sparse_mask_nnz = col_count.to(torch.float32).sum().item() * block_size_M
673
+ print(f'block mask sparsity: {1 - sparse_mask_nnz / dense_mask_nnz}')
674
+ pycuda_build_index_fn = lambda: pycuda_build_index(seqlens, vertical_indexes, slash_indexes, block_size_M)
675
+ profile(pycuda_build_index_fn, 0., 'pycuda-index')
676
+
677
+ if torch_test:
678
+ finegrained_mask = make_finegrained_mask(vertical_indexes, slash_indexes, causal_mask, device)
679
+ block_mask = make_block_mask(col_count, col_index, seqlens, causal_mask, device, block_size_M)
680
+ # plot_mask(finegrained_mask, 'mask.png', 2, 26)
681
+ # plot_mask(block_mask, 'mask-1.png', 2, 26)
682
+ ref_o_sparse = torch_forward(q, k, v, block_mask, sm_scale)
683
+
684
+ triton_dense_fn = lambda: triton_dense_forward(q, k, v, seqlens, sm_scale)
685
+ output = triton_dense_fn()
686
+ if torch_test:
687
+ torch.testing.assert_close(output, ref_o_dense, atol=1e-2, rtol=0)
688
+ profile(triton_dense_fn, 2. * head_dim * dense_mask_nnz, 'triton-dense')
689
+
690
+ triton_sparse_fn = lambda: triton_sparse_forward(q, k, v, seqlens, col_count, col_index, sm_scale, block_size_M, block_size_N)
691
+ output = triton_sparse_fn()
692
+ if torch_test:
693
+ torch.testing.assert_close(output, ref_o_sparse, atol=1e-2, rtol=0)
694
+ profile(triton_sparse_fn, 2. * head_dim * sparse_mask_nnz, 'triton-sparse')
695
+
696
+ q = q.swapaxes(1, 2).contiguous()
697
+ k = k.swapaxes(1, 2).contiguous()
698
+ v = v.swapaxes(1, 2).contiguous()
699
+ q = torch.concatenate([q[i, :seqlen, :, :] for i, seqlen in enumerate(seqlens)])
700
+ k = torch.concatenate([k[i, :seqlen, :, :] for i, seqlen in enumerate(seqlens)])
701
+ v = torch.concatenate([v[i, :seqlen, :, :] for i, seqlen in enumerate(seqlens)])
702
+ seqlens = torch.nn.functional.pad(torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0))
703
+
704
+ flash_fn = lambda: flash_attn_forward(q, k, v, seqlens, sm_scale, context_size)
705
+ output = flash_fn()
706
+ output = torch.stack([
707
+ torch.nn.functional.pad(
708
+ output[seqlens[i]:seqlens[i + 1], :, :],
709
+ (0, 0, 0, 0, 0, context_size + seqlens[i] - seqlens[i + 1])
710
+ )
711
+ for i in range(batch_size)
712
+ ]).swapaxes(1, 2).contiguous()
713
+ if torch_test:
714
+ torch.testing.assert_close(output, ref_o_dense, atol=1e-2, rtol=0)
715
+ profile(flash_fn, 2. * head_dim * dense_mask_nnz, 'flash-dense')
716
+ print('========================================\n')
717
+
718
+
719
+ def pit_sparse_flash_attention_forward(
720
+ query: torch.Tensor, # [BATCH, N_HEADS, N_CTX, D_HEAD]
721
+ key: torch.Tensor, # [BATCH, N_HEADS, N_CTX, D_HEAD]
722
+ value: torch.Tensor, # [BATCH, N_HEADS, N_CTX, D_HEAD]
723
+ v_idx: torch.Tensor, # [BATCH, N_HEADS, NNZ_V]
724
+ s_idx: torch.Tensor, # [BATCH, N_HEADS, NNZ_S]
725
+ block_size_M: int = 64,
726
+ block_size_N: int = 64,
727
+ ):
728
+ q_len = query.shape[2]
729
+ pad = block_size_M - (query.shape[2] & (block_size_M - 1))
730
+ query = torch.nn.functional.pad(query, [0, 0, 0, pad, 0, 0, 0, 0])
731
+ key = torch.nn.functional.pad(key, [0, 0, 0, pad, 0, 0, 0, 0])
732
+ value = torch.nn.functional.pad(value, [0, 0, 0, pad, 0, 0, 0, 0])
733
+ batch_size, num_heads, context_size, head_dim = query.shape
734
+ v_idx = v_idx.to(torch.int32).reshape((batch_size, num_heads, -1)).sort(dim=-1, descending=False)[0]
735
+ s_idx = s_idx.to(torch.int32).reshape((batch_size, num_heads, -1)).sort(dim=-1, descending=True)[0]
736
+ seqlens = torch.tensor([context_size], dtype=torch.int32, device=query.device)
737
+ sm_scale = head_dim ** -0.5
738
+ col_count, col_index = pycuda_build_index(seqlens, v_idx, s_idx, block_size_M)
739
+ out = triton_sparse_forward(query, key, value, seqlens, col_count, col_index, sm_scale, block_size_M, block_size_N)[...,:q_len,:]
740
+ return out
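
A minimal driver for the entry point above (illustrative only; assumes a CUDA device plus the triton/pycuda/flash_attn setup imported by this module, with arbitrary index counts):

    B, H, N, D = 1, 2, 1000, 128
    q = torch.randn(B, H, N, D, dtype=torch.float16, device='cuda')
    k, v = torch.randn_like(q), torch.randn_like(q)
    # NNZ_V vertical columns and NNZ_S slash diagonals per head, values in [0, N).
    v_idx = torch.randint(0, N, (B, H, 64), dtype=torch.int32, device='cuda')
    s_idx = torch.randint(1, N, (B, H, 16), dtype=torch.int32, device='cuda')
    s_idx[..., -1] = 0  # include the main diagonal (slash 0), as the test helper does
    out = pit_sparse_flash_attention_forward(q, k, v, v_idx, s_idx)
    assert out.shape == (B, H, N, D)  # padding to a BLOCK_M multiple is sliced off

The function sorts v_idx ascending and s_idx descending itself, so callers do not need pre-sorted indexes.
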
minference/ops/pit_sparse_flash_attention_v2.py ADDED
@@ -0,0 +1,735 @@
1
+ import torch
2
+ import numpy as np
3
+ import math
4
+
5
+ import triton
6
+ import triton.language as tl
7
+ import pycuda.autoprimaryctx
8
+ from pycuda.compiler import SourceModule
9
+
10
+ from flash_attn import flash_attn_varlen_func
11
+
12
+
13
+ # @triton.autotune(
14
+ # configs=[
15
+ # triton.Config({}, num_stages=1, num_warps=4),
16
+ # triton.Config({}, num_stages=1, num_warps=8),
17
+ # triton.Config({}, num_stages=2, num_warps=4),
18
+ # triton.Config({}, num_stages=2, num_warps=8),
19
+ # triton.Config({}, num_stages=3, num_warps=4),
20
+ # triton.Config({}, num_stages=3, num_warps=8),
21
+ # triton.Config({}, num_stages=4, num_warps=4),
22
+ # triton.Config({}, num_stages=4, num_warps=8),
23
+ # triton.Config({}, num_stages=5, num_warps=4),
24
+ # triton.Config({}, num_stages=5, num_warps=8),
25
+ # ],
26
+ # key=['N_CTX'],
27
+ # )
28
+ @triton.jit
29
+ def triton_sparse_fwd_kernel(
30
+ Q, K, V, seqlens, sm_scale,
31
+ block_count, block_offset, column_count, column_index,
32
+ Out,
33
+ stride_qz, stride_qh, stride_qm, stride_qk,
34
+ stride_kz, stride_kh, stride_kn, stride_kk,
35
+ stride_vz, stride_vh, stride_vn, stride_vk,
36
+ stride_oz, stride_oh, stride_om, stride_ok,
37
+ Z, H, N_CTX,
38
+ NUM_ROWS, NNZ_S, NNZ_V,
39
+ BLOCK_M: tl.constexpr,
40
+ BLOCK_N: tl.constexpr,
41
+ BLOCK_DMODEL: tl.constexpr,
42
+ dtype: tl.constexpr,
43
+ ):
44
+ start_m = tl.program_id(0)
45
+ off_hz = tl.program_id(1)
46
+
47
+ seqlen = tl.load(seqlens + off_hz // H)
48
+ if start_m * BLOCK_M >= seqlen:
49
+ return
50
+
51
+ # initialize offsets
52
+ offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
53
+ offs_n = tl.arange(0, BLOCK_N)
54
+ offs_d = tl.arange(0, BLOCK_DMODEL)
55
+
56
+ qo_offset = (off_hz // H) * stride_qz + (off_hz % H) * stride_qh
57
+ kv_offset = (off_hz // H) * stride_kz + (off_hz % H) * stride_kh
58
+
59
+ q_ptrs = Q + qo_offset + offs_m[:, None] * stride_qm + offs_d[None, :] * stride_qk
60
+ k_ptrs = K + kv_offset + offs_d[:, None] * stride_kk
61
+ v_ptrs = V + kv_offset + offs_d[None, :] * stride_vk
62
+ o_ptrs = Out + qo_offset + offs_m[:, None] * stride_om + offs_d[None, :] * stride_ok
63
+
64
+ num_blks = tl.load(block_count + off_hz * NUM_ROWS + start_m)
65
+ blks_ptr = block_offset + (off_hz * NUM_ROWS + start_m) * NNZ_S
66
+ num_cols = tl.load(column_count + off_hz * NUM_ROWS + start_m)
67
+ cols_ptr = column_index + (off_hz * NUM_ROWS + start_m) * NNZ_V
68
+
69
+ # initialize pointer to m and l
70
+ m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
71
+ l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
72
+ acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
73
+ # scale sm_scale by log_2(e) and use
74
+ # 2^x instead of exp in the loop because CSE and LICM
75
+ # don't work as expected with `exp` in the loop
76
+ qk_scale = sm_scale * 1.44269504
77
+ # load q: it will stay in SRAM throughout
78
+ q = tl.load(q_ptrs)
79
+ q = (q * qk_scale).to(dtype)
80
+
81
+ # loop over k, v and update accumulator
82
+ m_mask = offs_m[:, None] < seqlen
83
+
84
+ for block_index in range(num_blks):
85
+ start_n = tl.load(blks_ptr + block_index)
86
+ cols = start_n + offs_n
87
+ n_mask = cols < seqlen
88
+ # -- load k, v --
89
+ k = tl.load(k_ptrs + cols[None, :] * stride_kn, mask=n_mask[None, :], other=0.0)
90
+ v = tl.load(v_ptrs + cols[:, None] * stride_vn, mask=n_mask[:, None], other=0.0)
91
+ # -- compute qk --
92
+ qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
93
+ causal_mask = cols[None, :] <= offs_m[:, None]
94
+ qk = tl.where(m_mask & causal_mask, qk, float("-inf"))
95
+ qk += tl.dot(q, k)
96
+ # -- compute scaling constant --
97
+ m_i_new = tl.maximum(m_i, tl.max(qk, 1))
98
+ alpha = tl.math.exp2(m_i - m_i_new)
99
+ p = tl.math.exp2(qk - m_i_new[:, None])
100
+ # -- scale and update acc --
101
+ acc_scale = l_i * 0 + alpha # workaround some compiler bug
102
+ acc *= acc_scale[:, None]
103
+ acc += tl.dot(p.to(dtype), v)
104
+ # -- update m_i and l_i --
105
+ l_i = l_i * alpha + tl.sum(p, 1)
106
+ m_i = m_i_new
107
+
108
+ for start_n in range(0, num_cols, BLOCK_N):
109
+ n_mask = start_n + offs_n < num_cols
110
+ cols = tl.load(cols_ptr + start_n + offs_n, mask=n_mask, other=0)
111
+ # -- load k, v --
112
+ k = tl.load(k_ptrs + cols[None, :] * stride_kn, mask=n_mask[None, :], other=0.0)
113
+ v = tl.load(v_ptrs + cols[:, None] * stride_vn, mask=n_mask[:, None], other=0.0)
114
+ # -- compute qk --
115
+ qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
116
+ qk = tl.where(m_mask & n_mask, qk, float("-inf"))
117
+ qk += tl.dot(q, k)
118
+ # -- compute scaling constant --
119
+ m_i_new = tl.maximum(m_i, tl.max(qk, 1))
120
+ alpha = tl.math.exp2(m_i - m_i_new)
121
+ p = tl.math.exp2(qk - m_i_new[:, None])
122
+ # -- scale and update acc --
123
+ acc_scale = l_i * 0 + alpha # workaround some compiler bug
124
+ acc *= acc_scale[:, None]
125
+ acc += tl.dot(p.to(dtype), v)
126
+ # -- update m_i and l_i --
127
+ l_i = l_i * alpha + tl.sum(p, 1)
128
+ m_i = m_i_new
129
+
130
+ # write back O
131
+ acc /= l_i[:, None]
132
+ # acc = tl.where(m_mask, acc / l_i[:, None], 0.0)
133
+ tl.store(o_ptrs, acc.to(dtype), mask=m_mask)
134
+
135
+
136
+ def triton_sparse_forward(
137
+ q: torch.Tensor, # [BATCH, N_HEADS, N_CTX, D_HEAD]
138
+ k: torch.Tensor, # [BATCH, N_HEADS, N_CTX, D_HEAD]
139
+ v: torch.Tensor, # [BATCH, N_HEADS, N_CTX, D_HEAD]
140
+ seqlens: torch.Tensor, # [BATCH, ]
141
+ block_count: torch.Tensor, # [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
142
+ block_offset: torch.Tensor, # [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_S]
143
+ column_count: torch.Tensor, # [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
144
+ column_index: torch.Tensor, # [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_V]
145
+ sm_scale: float,
146
+ block_size_M: int = 64,
147
+ block_size_N: int = 64,
148
+ ) -> torch.Tensor:
149
+ # shape constraints
150
+ Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
151
+ assert Lq == Lk and Lk == Lv
152
+ assert Lk in {16, 32, 64, 128}
153
+ o = torch.zeros_like(q)
154
+ grid = (triton.cdiv(q.shape[2], block_size_M), q.shape[0] * q.shape[1], 1)
155
+ dtype = tl.bfloat16 if q.dtype == torch.bfloat16 else tl.float16
156
+ triton_sparse_fwd_kernel[grid](
157
+ q, k, v, seqlens, sm_scale,
158
+ block_count, block_offset, column_count, column_index,
159
+ o,
160
+ q.stride(0), q.stride(1), q.stride(2), q.stride(3),
161
+ k.stride(0), k.stride(1), k.stride(2), k.stride(3),
162
+ v.stride(0), v.stride(1), v.stride(2), v.stride(3),
163
+ o.stride(0), o.stride(1), o.stride(2), o.stride(3),
164
+ q.shape[0], q.shape[1], q.shape[2],
165
+ block_count.shape[-1], block_offset.shape[-1], column_index.shape[-1],
166
+ BLOCK_M=block_size_M, BLOCK_N=block_size_N,
167
+ BLOCK_DMODEL=Lk,
168
+ dtype=dtype,
169
+ num_warps=4, num_stages=2,
170
+ )
171
+
172
+ return o
173
+
174
+
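
Compared with the v1 kernel earlier in this commit, the index format here is two-level, matching the two loops in the kernel above: slash diagonals are merged into whole BLOCK_N-aligned key blocks (block_count / block_offset), and vertical lines that fall outside those blocks remain as gathered single columns (column_count / column_index). A schematic of one (batch, head, block-row) entry (illustrative only):

    # Assuming BLOCK_N = 64:
    # block_count  = 3
    # block_offset = [0, 64, 896, 0, ...]   # loop 1: dense tiles k/v[0:64], [64:128], [896:960]
    # column_count = 2
    # column_index = [300, 511, 0, ...]     # loop 2: gathered columns 300 and 511
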
175
+ def torch_build_index(seqlens, vertical_indexes, slash_indexes, context_size, block_size_M=64, block_size_N=64):
176
+ device = seqlens.device
177
+ batch_size, num_heads, NNZ_S = slash_indexes.shape
178
+ NNZ_V = vertical_indexes.shape[-1]
179
+ num_rows = triton.cdiv(context_size, block_size_M)
180
+ block_count = torch.zeros((batch_size, num_heads, num_rows), dtype=torch.int32)
181
+ block_offset = torch.zeros((batch_size, num_heads, num_rows, NNZ_S), dtype=torch.int32)
182
+ column_count = torch.zeros((batch_size, num_heads, num_rows), dtype=torch.int32)
183
+ column_index = torch.zeros((batch_size, num_heads, num_rows, NNZ_V), dtype=torch.int32)
184
+
185
+ for b in range(batch_size):
186
+ seqlen = seqlens[b]
187
+ for h in range(num_heads):
188
+ for m, start_m in enumerate(range(0, seqlen, block_size_M)):
189
+ end_m = start_m + block_size_M
190
+ s = 0
191
+ # NOTE: assumes a slash diagonal smaller than end_m is present (the tests
+ # always append slash 0), so this loop terminates before running off the list
+ while slash_indexes[b, h, s] >= end_m:
192
+ s += 1
193
+ s_idx = max(end_m - slash_indexes[b, h, s], block_size_M)
194
+ s += 1
195
+ range_start = s_idx - block_size_M
196
+ range_end = s_idx
197
+ tmp_blocks = []
198
+ while s < NNZ_S:
199
+ s_idx = max(end_m - slash_indexes[b, h, s], block_size_M)
200
+ if s_idx > range_end + block_size_M:
201
+ tmp_blocks += list(range(range_start, range_end, block_size_N))
202
+ range_start = s_idx - block_size_M
203
+ range_end = s_idx
204
+ elif s_idx > range_end:
205
+ range_end += block_size_M
206
+ s += 1
207
+ tmp_blocks += list(range(range_start, range_end, block_size_N))
208
+ block_count[b, h, m] = len(tmp_blocks)
209
+ block_offset[b, h, m, :len(tmp_blocks)] = torch.tensor(tmp_blocks, dtype=block_offset.dtype)
210
+ tmp_columns = vertical_indexes[b, h].cpu().numpy().tolist()
211
+ tmp_columns = [col for col in tmp_columns if col < range_end]
212
+ for range_start in tmp_blocks:
213
+ range_end = range_start + block_size_N
214
+ tmp_columns = [col for col in tmp_columns if col < range_start or col >= range_end]
215
+ column_count[b, h, m] = len(tmp_columns)
216
+ column_index[b, h, m, :len(tmp_columns)] = torch.tensor(tmp_columns, dtype=block_offset.dtype)
217
+
218
+ return block_count.to(device), block_offset.to(device), column_count.to(device), column_index.to(device)
219
+
220
+
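
The while-loop above coalesces nearby slash diagonals into contiguous block ranges before emitting BLOCK_N-sized offsets. A worked trace for one block row (illustrative only), with block_size_M = block_size_N = 4 and row [4, 8):

    # slashes (descending) [6, 1, 0] give s_idx = max(end_m - slash, block_size_M) = 4, 7, 8
    #   s_idx 4 opens the range [0, 4)
    #   s_idx 7 is within one block of range_end, so the range grows to [0, 8)
    #   s_idx 8 already falls inside [0, 8) and is absorbed
    # tmp_blocks = [0, 4]                   -> block_count = 2
    # vertical columns [1, 5] lie inside those blocks -> column_count = 0
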
221
+ PYCUDA_BUILD_INDEX_KERNEL_CODE = '''\
222
+ __device__ int min(int x, int y) {
223
+ return x < y ? x : y;
224
+ }
225
+
226
+ __device__ int max(int x, int y) {
227
+ return x > y ? x : y;
228
+ }
229
+
230
+ __device__ void save_blocks(int* block_offset, int range_start, int range_end, int block_size, int& block_count) {
231
+ for (int idx = range_start; idx < range_end; idx += block_size) {
232
+ block_offset[block_count++] = idx;
233
+ }
234
+ }
235
+
236
+ __global__ void PYCUDA_BUILD_INDEX_KERNEL(
237
+ const int* seqlens, // [BATCH, ]
238
+ const int* vertical_indexes, // [BATCH, N_HEADS, NNZ_V]
239
+ const int* slash_indexes, // [BATCH, N_HEADS, NNZ_S]
240
+ int* block_count, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
241
+ int* block_offset, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_S]
242
+ int* column_count, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M)]
243
+ int* column_index, // [BATCH, N_HEADS, cdiv(N_CTX, BLOCK_SIZE_M), NNZ_V]
244
+ int N_HEADS,
245
+ int N_ROWS,
246
+ int BLOCK_SIZE_M,
247
+ int BLOCK_SIZE_N,
248
+ int NNZ_V,
249
+ int NNZ_S
250
+ ) {
251
+ const int batch_idx = blockIdx.y;
252
+ const int head_idx = blockIdx.x;
253
+ const int group_idx = blockIdx.z;
254
+
255
+ int seqlen = seqlens[batch_idx];
256
+ int block_idx_m = group_idx * blockDim.x + threadIdx.x;
257
+ int start_m = block_idx_m * BLOCK_SIZE_M;
258
+ if (start_m >= seqlen) {
259
+ return;
260
+ }
261
+ int end_m = start_m + BLOCK_SIZE_M;
262
+ vertical_indexes += (batch_idx * N_HEADS + head_idx) * NNZ_V;
263
+ slash_indexes += (batch_idx * N_HEADS + head_idx) * NNZ_S;
264
+ int row_offset = (batch_idx * N_HEADS + head_idx) * N_ROWS + block_idx_m;
265
+ block_count += row_offset;
266
+ block_offset += row_offset * NNZ_S;
267
+ column_count += row_offset;
268
+ column_index += row_offset * NNZ_V;
269
+
270
+ int tmp_col_cnt = 0, tmp_blk_cnt = 0;
271
+ int s = 0, v = 0;
272
+ int v_idx = vertical_indexes[v++];
273
+ int s_idx = slash_indexes[s++];
274
+ while (s_idx >= end_m) {
275
+ s_idx = slash_indexes[s++];
276
+ }
277
+ s_idx = max(end_m - s_idx, BLOCK_SIZE_M);
278
+ int range_start = s_idx - BLOCK_SIZE_M, range_end = s_idx;
279
+ while (1) {
280
+ if (v_idx < range_end) {
281
+ if (v_idx < range_start) {
282
+ column_index[tmp_col_cnt++] = v_idx;
283
+ }
284
+ if (v < NNZ_V) {
285
+ v_idx = vertical_indexes[v++];
286
+ } else {
287
+ v_idx = end_m + BLOCK_SIZE_M;
288
+ }
289
+ } else {
290
+ if (s < NNZ_S) {
291
+ s_idx = max(end_m - slash_indexes[s++], BLOCK_SIZE_M);
292
+ } else {
293
+ save_blocks(block_offset, range_start, range_end, BLOCK_SIZE_N, tmp_blk_cnt);
294
+ break;
295
+ }
296
+ if (s_idx > range_end + BLOCK_SIZE_M) {
297
+ save_blocks(block_offset, range_start, range_end, BLOCK_SIZE_N, tmp_blk_cnt);
298
+ range_start = s_idx - BLOCK_SIZE_M;
299
+ range_end = s_idx;
300
+ } else if (s_idx > range_end) {
301
+ range_end += BLOCK_SIZE_M;
302
+ }
303
+ }
304
+ }
305
+
306
+ block_count[0] = tmp_blk_cnt;
307
+ column_count[0] = tmp_col_cnt;
308
+ }
309
+ '''
310
+ PYCUDA_BUILD_INDEX_KERNEL = SourceModule(
311
+ PYCUDA_BUILD_INDEX_KERNEL_CODE,
312
+ options=['-std=c++14', '-O3'],
313
+ ).get_function('PYCUDA_BUILD_INDEX_KERNEL')
314
+
315
+
316
+ def pycuda_build_index(seqlens, vertical_indexes, slash_indexes, context_size, block_size_M=64, block_size_N=64):
317
+ batch_size, num_heads, NNZ_S = slash_indexes.shape
318
+ NNZ_V = vertical_indexes.shape[-1]
319
+ num_rows = triton.cdiv(context_size, block_size_M)
320
+ block_count = torch.zeros((batch_size, num_heads, num_rows), dtype=torch.int32, device=seqlens.device)
321
+ block_offset = torch.zeros((batch_size, num_heads, num_rows, NNZ_S), dtype=torch.int32, device=seqlens.device)
322
+ column_count = torch.zeros((batch_size, num_heads, num_rows), dtype=torch.int32, device=seqlens.device)
323
+ column_index = torch.zeros((batch_size, num_heads, num_rows, NNZ_V), dtype=torch.int32, device=seqlens.device)
324
+ num_threads = 64
325
+ # import ipdb; ipdb.set_trace()
326
+ PYCUDA_BUILD_INDEX_KERNEL(
327
+ seqlens, vertical_indexes, slash_indexes,
328
+ block_count, block_offset, column_count, column_index,
329
+ np.int32(num_heads), np.int32(num_rows),
330
+ np.int32(block_size_M), np.int32(block_size_N),
331
+ np.int32(NNZ_V), np.int32(NNZ_S),
332
+ # grid=(triton.cdiv(num_rows, num_threads), N_HEADS, BATCH),
333
+ grid=(num_heads, batch_size, triton.cdiv(num_rows, num_threads)),
334
+ block=(num_threads, 1, 1),
335
+ )
336
+ return block_count, block_offset, column_count, column_index
337
+
338
+
339
+ def make_causal_mask(seqlens, device, context_size):
340
+ batch_size = seqlens.shape[0]
341
+ arange = torch.arange(context_size, dtype=torch.int32, device=device)
342
+ causal_mask = arange[None, None, :, None] >= arange[None, None, None, :]
343
+ causal_mask = causal_mask.repeat((batch_size, 1, 1, 1))
344
+ for b, seqlen in enumerate(seqlens):
345
+ causal_mask[b, :, seqlen:, :] = False
346
+ causal_mask[b, :, :, seqlen:] = False
347
+ return causal_mask
348
+
349
+
350
+ def make_finegrained_mask(vertical_indexes, slash_indexes, causal_mask, device):
351
+ batch_size, num_heads, _ = vertical_indexes.shape
352
+ context_size = causal_mask.shape[-1]
353
+ arange = torch.arange(context_size, dtype=torch.int32, device=device)
354
+ sparse_mask = torch.zeros((batch_size, num_heads, context_size, context_size), dtype=torch.bool, device=device)
355
+ for b in range(batch_size):
356
+ for h in range(num_heads):
357
+ for vertical_index in vertical_indexes[b, h]:
358
+ sparse_mask[b, h, :, vertical_index] = True
359
+ for slash_index in slash_indexes[b, h]:
360
+ sparse_mask[b, h].logical_or_(arange[:, None] - arange[None, :] == slash_index)
361
+ sparse_mask.logical_and_(causal_mask)
362
+ return sparse_mask
363
+
364
+
365
+ def make_block_mask(
366
+ block_count: torch.Tensor,
367
+ block_offset: torch.Tensor,
368
+ column_count: torch.Tensor,
369
+ column_index: torch.Tensor,
370
+ seqlens: torch.Tensor,
371
+ causal_mask: torch.Tensor,
372
+ device: torch.device,
373
+ block_size_M: int = 64,
374
+ block_size_N: int = 64,
375
+ ):
376
+ batch_size, num_heads, _ = block_count.shape
377
+ context_size = causal_mask.shape[-1]
378
+ block_mask = torch.zeros((batch_size, num_heads, context_size, context_size), dtype=torch.bool, device=device)
379
+ for b in range(batch_size):
380
+ for h in range(num_heads):
381
+ for m, start_m in enumerate(range(0, seqlens[b], block_size_M)):
382
+ end_m = start_m + block_size_M
383
+ for col_idx in range(column_count[b, h, m]):
384
+ block_mask[b, h, start_m:end_m, column_index[b, h, m, col_idx]] = True
385
+ for blk_idx in range(block_count[b, h, m]):
386
+ blk_start = block_offset[b, h, m, blk_idx].item()
387
+ blk_end = blk_start + block_size_N
388
+ block_mask[b, h, start_m:end_m, blk_start:blk_end] = True
389
+ block_mask.logical_and_(causal_mask)
390
+ return block_mask
391
+
392
+
393
+ def plot_mask(mask, name, batch=0, head=0):
394
+ import matplotlib.pyplot as plt
395
+ import seaborn as sns
396
+ plt.figure(figsize=(16, 12))
397
+ plt.clf()
398
+ mask = mask[batch, head].cpu().numpy()
399
+ sns.heatmap(mask)
400
+ plt.savefig(name)
401
+
402
+
403
+ @triton.jit
404
+ def triton_dense_fwd_kernel(
405
+ Q, K, V, seqlens, sm_scale,
406
+ Out,
407
+ stride_qz, stride_qh, stride_qm, stride_qk,
408
+ stride_kz, stride_kh, stride_kn, stride_kk,
409
+ stride_vz, stride_vh, stride_vn, stride_vk,
410
+ stride_oz, stride_oh, stride_om, stride_ok,
411
+ Z, H, N_CTX,
412
+ BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr,
413
+ BLOCK_N: tl.constexpr,
414
+ dtype: tl.constexpr,
415
+ ):
416
+ start_m = tl.program_id(0)
417
+ off_hz = tl.program_id(1)
418
+
419
+ seqlen = tl.load(seqlens + off_hz // H)
420
+ if start_m * BLOCK_M >= seqlen:
421
+ return
422
+
423
+ qo_offset = (off_hz // H) * stride_qz + (off_hz % H) * stride_qh
424
+ kv_offset = (off_hz // H) * stride_kz + (off_hz % H) * stride_kh
425
+ Q_block_ptr = tl.make_block_ptr(
426
+ base=Q + qo_offset,
427
+ shape=(N_CTX, BLOCK_DMODEL),
428
+ strides=(stride_qm, stride_qk),
429
+ offsets=(start_m * BLOCK_M, 0),
430
+ block_shape=(BLOCK_M, BLOCK_DMODEL),
431
+ order=(1, 0)
432
+ )
433
+ K_block_ptr = tl.make_block_ptr(
434
+ base=K + kv_offset,
435
+ shape=(BLOCK_DMODEL, N_CTX),
436
+ strides=(stride_kk, stride_kn),
437
+ offsets=(0, 0),
438
+ block_shape=(BLOCK_DMODEL, BLOCK_N),
439
+ order=(0, 1)
440
+ )
441
+ V_block_ptr = tl.make_block_ptr(
442
+ base=V + kv_offset,
443
+ shape=(N_CTX, BLOCK_DMODEL),
444
+ strides=(stride_vn, stride_vk),
445
+ offsets=(0, 0),
446
+ block_shape=(BLOCK_N, BLOCK_DMODEL),
447
+ order=(1, 0)
448
+ )
449
+ # initialize offsets
450
+ offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
451
+ offs_n = tl.arange(0, BLOCK_N)
452
+ # initialize pointer to m and l
453
+ m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
454
+ l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
455
+ acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
456
+ # scale sm_scale by log_2(e) and use
457
+ # 2^x instead of exp in the loop because CSE and LICM
458
+ # don't work as expected with `exp` in the loop
459
+ qk_scale = sm_scale * 1.44269504
460
+ # load q: it will stay in SRAM throughout
461
+ q = tl.load(Q_block_ptr)
462
+ q = (q * qk_scale).to(dtype)
463
+ # loop over k, v and update accumulator
464
+ lo = 0
465
+ hi = (start_m + 1) * BLOCK_M
466
+ m_mask = offs_m[:, None] < seqlen
467
+
468
+ for start_n in range(lo, hi, BLOCK_N):
469
+ n_mask = (start_n + offs_n[None, :]) <= offs_m[:, None]
470
+ # -- load k, v --
471
+ k = tl.load(K_block_ptr)
472
+ v = tl.load(V_block_ptr)
473
+ # -- compute qk --
474
+ qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
475
+ qk = tl.where(m_mask & n_mask, qk, float("-inf"))
476
+ qk += tl.dot(q, k)
477
+ # -- compute scaling constant --
478
+ m_i_new = tl.maximum(m_i, tl.max(qk, 1))
479
+ alpha = tl.math.exp2(m_i - m_i_new)
480
+ p = tl.math.exp2(qk - m_i_new[:, None])
481
+ # -- scale and update acc --
482
+ acc_scale = l_i * 0 + alpha # workaround some compiler bug
483
+ acc *= acc_scale[:, None]
484
+ acc += tl.dot(p.to(dtype), v)
485
+ # -- update m_i and l_i --
486
+ l_i = l_i * alpha + tl.sum(p, 1)
487
+ m_i = m_i_new
488
+ # update pointers
489
+ K_block_ptr = tl.advance(K_block_ptr, (0, BLOCK_N))
490
+ V_block_ptr = tl.advance(V_block_ptr, (BLOCK_N, 0))
491
+ # write back O
492
+ acc = tl.where(m_mask, acc / l_i[:, None], 0.0)
493
+ O_block_ptr = tl.make_block_ptr(
494
+ base=Out + qo_offset,
495
+ shape=(N_CTX, BLOCK_DMODEL),
496
+ strides=(stride_om, stride_ok),
497
+ offsets=(start_m * BLOCK_M, 0),
498
+ block_shape=(BLOCK_M, BLOCK_DMODEL),
499
+ order=(1, 0)
500
+ )
501
+ tl.store(O_block_ptr, acc.to(dtype))
502
+
503
+
504
+ def triton_dense_forward(q, k, v, seqlens, sm_scale, block_size_M=128, block_size_N=64) -> torch.Tensor:
505
+ # shape constraints
506
+ Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
507
+ assert Lq == Lk and Lk == Lv
508
+ assert Lk in {16, 32, 64, 128}
509
+ o = torch.zeros_like(q)
510
+ grid = (triton.cdiv(q.shape[2], block_size_M), q.shape[0] * q.shape[1], 1)
511
+ num_warps = 4 if Lk <= 64 else 8 # 4
512
+ dtype = tl.bfloat16 if q.dtype == torch.bfloat16 else tl.float16
513
+ triton_dense_fwd_kernel[grid](
514
+ q, k, v, seqlens, sm_scale,
515
+ o,
516
+ q.stride(0), q.stride(1), q.stride(2), q.stride(3),
517
+ k.stride(0), k.stride(1), k.stride(2), k.stride(3),
518
+ v.stride(0), v.stride(1), v.stride(2), v.stride(3),
519
+ o.stride(0), o.stride(1), o.stride(2), o.stride(3),
520
+ q.shape[0], q.shape[1], q.shape[2],
521
+ BLOCK_M=block_size_M, BLOCK_N=block_size_N,
522
+ BLOCK_DMODEL=Lk,
523
+ dtype=dtype,
524
+ num_warps=num_warps, num_stages=4,
525
+ )
526
+
527
+ return o
528
+
529
+
530
+ def flash_attn_forward(q, k, v, seqlens, sm_scale, context_size) -> torch.Tensor:
531
+ return flash_attn_varlen_func(
532
+ q,
533
+ k,
534
+ v,
535
+ cu_seqlens_q=seqlens,
536
+ cu_seqlens_k=seqlens,
537
+ max_seqlen_q=context_size,
538
+ max_seqlen_k=context_size,
539
+ dropout_p=0.0,
540
+ softmax_scale=sm_scale,
541
+ causal=True,
542
+ )
543
+
544
+
545
+ def torch_forward(
546
+ query: torch.Tensor,
547
+ key: torch.Tensor,
548
+ value: torch.Tensor,
549
+ mask: torch.Tensor,
550
+ sm_scale: float,
551
+ ) -> torch.Tensor:
552
+ p = torch.einsum('bhmk, bhnk -> bhmn', query, key) * sm_scale
553
+ p = p.where(mask, -torch.inf)
554
+ p_max = p.max(-1, keepdim=True).values
555
+ p_max = torch.where(p_max < 0, 0.0, p_max)
556
+ p_exp = torch.exp(p - p_max)
557
+ s = p_exp / (p_exp.sum(-1, keepdim=True) + 1e-6)
558
+ out = torch.einsum('bhmn, bhnk -> bhmk', s, value)
559
+ return out
560
+
561
+
562
+ def profile(fn, total_flops, tag, warmup=25, rep=100):
563
+ ms = triton.testing.do_bench(fn, warmup=warmup, rep=rep)
564
+ gflops = total_flops / ms * 1e-9
565
+ print(f'{tag}: {ms:.3f} ms | {gflops:.3f} GFLOP/s')
566
+
567
+
568
+ def test_flash_attention(
569
+ query=None,
570
+ key=None,
571
+ value=None,
572
+ seqlens=None,
573
+ vertical_indexes=None,
574
+ slash_indexes=None,
575
+ dtype=torch.float16,
576
+ device="cuda",
577
+ torch_test=True,
578
+ batch_size=4,
579
+ num_heads=32,
580
+ context_size=2048,
581
+ head_dim=128,
582
+ nnz_v=100,
583
+ nnz_s=10,
584
+ block_size_M=64,
585
+ block_size_N=64,
586
+ ):
587
+ print('========================================')
588
+ if query is None and key is None and value is None:
589
+ q = torch.randn((batch_size, num_heads, context_size, head_dim), dtype=dtype, device=device)
590
+ k = torch.randn((batch_size, num_heads, context_size, head_dim), dtype=dtype, device=device)
591
+ v = torch.randn((batch_size, num_heads, context_size, head_dim), dtype=dtype, device=device)
592
+ else:
593
+ q = torch.tensor(query, dtype=dtype, device=device)
594
+ k = torch.tensor(key, dtype=dtype, device=device)
595
+ v = torch.tensor(value, dtype=dtype, device=device)
596
+ batch_size, num_heads, context_size, head_dim = q.shape
597
+ print(f'BATCH={batch_size}, N_CTX={context_size}, N_HEADS={num_heads}, D_HEAD={head_dim}')
598
+ if seqlens is None:
599
+ seqlens = torch.randint(context_size // 2, context_size, (batch_size, ), dtype=torch.int32, device=device)
600
+ else:
601
+ seqlens = torch.tensor(seqlens, dtype=torch.int32, device=device)
602
+ print(seqlens)
603
+ dense_mask_nnz = seqlens.to(torch.float32).square().sum().item() * num_heads / 2
604
+ sm_scale = head_dim ** -0.5
605
+
606
+ if torch_test:
607
+ causal_mask = make_causal_mask(seqlens, device, context_size)
608
+ ref_o_dense = torch_forward(q, k, v, causal_mask, sm_scale)
609
+
610
+ if vertical_indexes is None or slash_indexes is None:
611
+ vertical_indexes = torch.stack([
612
+ torch.stack([
613
+ torch.randperm(seqlen, dtype=torch.int32, device=device)[:nnz_v].sort(descending=False)[0]
614
+ for _ in range(num_heads)
615
+ ])
616
+ for seqlen in seqlens
617
+ ])
618
+ slash_indexes = torch.concatenate([
619
+ torch.stack([
620
+ torch.stack([
621
+ torch.randperm(seqlen - 1, dtype=torch.int32, device=device)[:nnz_s - 1].sort(descending=True)[0] + 1
622
+ for _ in range(num_heads)
623
+ ])
624
+ for seqlen in seqlens
625
+ ]),
626
+ torch.zeros((batch_size, num_heads, 1), dtype=torch.int32, device=device)
627
+ ], dim=-1)
628
+ pycuda_build_index_fn = lambda: pycuda_build_index(
629
+ seqlens, vertical_indexes, slash_indexes, context_size, block_size_M, block_size_N
630
+ )
631
+ indexes = pycuda_build_index_fn()
632
+ block_count, block_offset, column_count, column_index = indexes
633
+ if torch_test:
634
+ block_count_ref, block_offset_ref, column_count_ref, column_index_ref = torch_build_index(
635
+ seqlens, vertical_indexes, slash_indexes, context_size, block_size_M, block_size_N
636
+ )
637
+ torch.testing.assert_close(block_count_ref, block_count)
638
+ torch.testing.assert_close(block_offset_ref, block_offset)
639
+ torch.testing.assert_close(column_count_ref, column_count)
640
+ torch.testing.assert_close(column_index_ref, column_index)
641
+ sparse_mask_nnz = column_count.to(torch.float64).sum().item() * block_size_M + \
642
+ block_count.to(torch.float64).sum().item() * block_size_M * block_size_N
643
+ print(f'block mask sparsity: {1 - sparse_mask_nnz / dense_mask_nnz}')
644
+
645
+ pycuda_build_index_fn = lambda: pycuda_build_index(
646
+ seqlens, vertical_indexes, slash_indexes, context_size, block_size_M, block_size_N
647
+ )
648
+ profile(pycuda_build_index_fn, 0., 'pycuda-index')
649
+
650
+ if torch_test:
651
+ finegrained_mask = make_finegrained_mask(vertical_indexes, slash_indexes, causal_mask, device)
652
+ block_mask = make_block_mask(*indexes, seqlens, causal_mask, device, block_size_M, block_size_N)
653
+ plot_mask(finegrained_mask, 'mask.png', 0, 0)
654
+ plot_mask(block_mask, 'mask-1.png', 0, 0)
655
+ ref_o_sparse = torch_forward(q, k, v, block_mask, sm_scale)
656
+
657
+ triton_dense_fn = lambda: triton_dense_forward(q, k, v, seqlens, sm_scale)
658
+ output_triton_dense = triton_dense_fn()
659
+ if torch_test:
660
+ # Note: not correct for context_size % block_size_M != 0
661
+ torch.testing.assert_close(output_triton_dense, ref_o_dense, atol=1e-2, rtol=0)
662
+ profile(triton_dense_fn, 2. * head_dim * dense_mask_nnz, 'triton-dense')
663
+
664
+ triton_sparse_fn = lambda: triton_sparse_forward(q, k, v, seqlens, *indexes, sm_scale, block_size_M, block_size_N)
665
+ output_triton_sparse = triton_sparse_fn()
666
+ if torch_test:
667
+ torch.testing.assert_close(output_triton_sparse, ref_o_sparse, atol=1e-2, rtol=0)
668
+ profile(triton_sparse_fn, 2. * head_dim * sparse_mask_nnz, 'triton-sparse')
669
+
670
+ q = q.swapaxes(1, 2).contiguous()
671
+ k = k.swapaxes(1, 2).contiguous()
672
+ v = v.swapaxes(1, 2).contiguous()
673
+ q = torch.concatenate([q[i, :seqlen, :, :] for i, seqlen in enumerate(seqlens)])
674
+ k = torch.concatenate([k[i, :seqlen, :, :] for i, seqlen in enumerate(seqlens)])
675
+ v = torch.concatenate([v[i, :seqlen, :, :] for i, seqlen in enumerate(seqlens)])
676
+ seqlens = torch.nn.functional.pad(torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0))
677
+
678
+ flash_fn = lambda: flash_attn_forward(q, k, v, seqlens, sm_scale, context_size)
679
+ output_flash = flash_fn()
680
+ output_flash = torch.stack([
681
+ torch.nn.functional.pad(
682
+ output_flash[seqlens[i]:seqlens[i + 1], :, :],
683
+ (0, 0, 0, 0, 0, context_size + seqlens[i] - seqlens[i + 1])
684
+ )
685
+ for i in range(batch_size)
686
+ ]).swapaxes(1, 2).contiguous()
687
+ if torch_test:
688
+ torch.testing.assert_close(output_flash, ref_o_dense, atol=1e-2, rtol=0)
689
+ profile(flash_fn, 2. * head_dim * dense_mask_nnz, 'flash-dense')
690
+ print('========================================\n')
691
+
692
+ if torch_test and sparse_mask_nnz >= dense_mask_nnz:
693
+ torch.testing.assert_close(output_flash, output_triton_sparse, atol=1e-2, rtol=0)
694
+
695
+
696
+ def pit_sparse_flash_attention_forward(
697
+ query: torch.Tensor, # [BATCH, N_HEADS, N_CTX, D_HEAD]
698
+ key: torch.Tensor, # [BATCH, N_HEADS, N_CTX, D_HEAD]
699
+ value: torch.Tensor, # [BATCH, N_HEADS, N_CTX, D_HEAD]
700
+ v_idx: torch.Tensor, # [BATCH, N_HEADS, NNZ_V]
701
+ s_idx: torch.Tensor, # [BATCH, N_HEADS, NNZ_S]
702
+ block_size_M: int = 64,
703
+ block_size_N: int = 64,
704
+ ):
705
+ batch_size, num_heads, context_size, head_dim = query.shape
706
+ pad = block_size_M - (context_size & (block_size_M - 1))
707
+ query = torch.nn.functional.pad(query, [0, 0, 0, pad, 0, 0, 0, 0])
708
+ key = torch.nn.functional.pad(key, [0, 0, 0, pad, 0, 0, 0, 0])
709
+ value = torch.nn.functional.pad(value, [0, 0, 0, pad, 0, 0, 0, 0])
710
+
711
+ if head_dim not in [16, 32, 64, 128, 256, 512]:
712
+ target_dim = 2 ** math.ceil(math.log2(head_dim)) - head_dim
713
+ query = torch.nn.functional.pad(query, [0, target_dim, 0, 0, 0, 0, 0, 0])
714
+ key = torch.nn.functional.pad(key, [0, target_dim, 0, 0, 0, 0, 0, 0])
715
+ value = torch.nn.functional.pad(value, [0, target_dim, 0, 0, 0, 0, 0, 0])
716
+
717
+ v_idx = v_idx.to(torch.int32).reshape((batch_size, num_heads, -1)).sort(dim=-1, descending=False)[0]
718
+ s_idx = s_idx.to(torch.int32).reshape((batch_size, num_heads, -1)).sort(dim=-1, descending=True)[0]
719
+ seqlens = torch.tensor([context_size], dtype=torch.int32, device=query.device)
720
+ sm_scale = head_dim ** -0.5
721
+ block_count, block_offset, column_count, column_index = pycuda_build_index(
722
+ seqlens, v_idx, s_idx, context_size, block_size_M, block_size_N,
723
+ )
724
+ # if context_size > 700000:
725
+ # import ipdb; ipdb.set_trace()
726
+ # dense_mask_nnz = seqlens.to(torch.float32).square().sum().item() * num_heads / 2
727
+ # sparse_mask_nnz = column_count.to(torch.float64).sum().item() * block_size_M + \
728
+ # block_count.to(torch.float64).sum().item() * block_size_M * block_size_N
729
+ # print(f'block mask sparsity: {1 - sparse_mask_nnz / dense_mask_nnz}')
730
+ out = triton_sparse_forward(
731
+ query, key, value, seqlens,
732
+ block_count, block_offset, column_count, column_index,
733
+ sm_scale, block_size_M, block_size_N,
734
+ )
735
+ return out[..., :context_size, :head_dim]
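
Shape bookkeeping for the wrapper above (illustrative only; assumes a CUDA device): the sequence is padded up to a block_size_M multiple, an irregular head_dim is padded to the next power of two, and both paddings are sliced off on return.

    B, H, N, D = 1, 2, 1000, 96     # N % 64 != 0 and D outside {16, 32, 64, 128, 256, 512}
    q = torch.randn(B, H, N, D, dtype=torch.float16, device='cuda')
    k, v = torch.randn_like(q), torch.randn_like(q)
    v_idx = torch.randint(0, N, (B, H, 32), dtype=torch.int32, device='cuda')
    s_idx = torch.randint(1, N, (B, H, 8), dtype=torch.int32, device='cuda')
    s_idx[..., 0] = 0               # keep the main diagonal; the index kernel assumes one slash < end_m
    out = pit_sparse_flash_attention_forward(q, k, v, v_idx, s_idx)
    # Internally N runs as 1024 and D as 128; sm_scale still uses the true head_dim.
    assert out.shape == (B, H, N, D)
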
minference/ops/streaming_kernel.py ADDED
@@ -0,0 +1,763 @@
1
+ """
2
+ Fused Attention
3
+ ===============
4
+
5
+ This is a Triton implementation of the Flash Attention v2 algorithm from Tri Dao (https://tridao.me/publications/flash2/flash2.pdf)
6
+ Credits: OpenAI kernel team
7
+
8
+ Extra Credits:
9
+ - Original flash attention paper (https://arxiv.org/abs/2205.14135)
10
+ - Rabe and Staats (https://arxiv.org/pdf/2112.05682v2.pdf)
11
+
12
+ """
13
+
14
+ import math
15
+
16
+ import torch
17
+ import triton
18
+ import triton.language as tl
19
+
20
+ _BLOCK_N = 64
21
+ _BLOCK_M = 64
22
+
23
+ @triton.jit
24
+ def _attn_fwd_inner(acc, l_i, m_i, q,
25
+ K_block_ptr, V_block_ptr,
26
+ start_m, qk_scale, N_CTX,
27
+ sliding_window_offset, sliding_window_size,
28
+ BLOCK_M: tl.constexpr, BLOCK_DMODEL: tl.constexpr, BLOCK_N: tl.constexpr, SLIDING_WINDOW: tl.constexpr,
29
+ IS_EVEN_M: tl.constexpr, IS_EVEN_N: tl.constexpr, COMPLEMENT_SLIDING_WINDOW: tl.constexpr
30
+ ):
31
+ # range of values handled by this stage
32
+ if SLIDING_WINDOW and not COMPLEMENT_SLIDING_WINDOW:
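+ # NOTE: COMPLEMENT_SLIDING_WINDOW is always False inside this branch, so the complement case
+ # just below is dead code; the complement pass instead falls through to the else branch,
+ # scans the full [0, N_CTX) range, and relies on the mask applied further down.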
33
+ if COMPLEMENT_SLIDING_WINDOW:
34
+ lo = 0
35
+ hi = (((start_m + 1) * BLOCK_M + sliding_window_offset - sliding_window_size + BLOCK_N - 1) // BLOCK_N) * BLOCK_N
36
+ else:
37
+ lo = ((start_m * BLOCK_M + sliding_window_offset - sliding_window_size + 1) // BLOCK_N) * BLOCK_N
38
+ hi = ((((start_m + 1) * BLOCK_M - 1) + sliding_window_offset + BLOCK_N) // BLOCK_N) * BLOCK_N
39
+ if lo < 0:
40
+ lo = 0
41
+ if hi > N_CTX:
42
+ hi = N_CTX
43
+
44
+ # lo = 0
45
+ # hi = N_CTX
46
+ lo = tl.multiple_of(lo, BLOCK_N)
47
+ K_block_ptr = tl.advance(K_block_ptr, (0, lo))
48
+ V_block_ptr = tl.advance(V_block_ptr, (lo, 0))
49
+ else:
50
+ lo, hi = 0, N_CTX
51
+
52
+ # loop over k, v and update accumulator
53
+ for start_n in range(lo, hi, BLOCK_N):
54
+ start_n = tl.multiple_of(start_n, BLOCK_N)
55
+ # -- compute qk ----
56
+ if IS_EVEN_N:
57
+ k = tl.load(K_block_ptr)
58
+ else:
59
+ k = tl.load(K_block_ptr, boundary_check=(0, 1), padding_option="zero")
60
+
61
+ qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
62
+ qk += tl.dot(q, k)
63
+ qk = qk * qk_scale
64
+
65
+ if SLIDING_WINDOW:
66
+ dist = tl.arange(0, BLOCK_M)[:, None] - tl.arange(0, BLOCK_N)[None, :] \
67
+ + start_m * BLOCK_M - start_n + sliding_window_offset
68
+
69
+ if COMPLEMENT_SLIDING_WINDOW:
70
+ mask = (dist >= sliding_window_size)
71
+ else:
72
+ mask = (dist >= 0) & (dist < sliding_window_size)
73
+
74
+ qk = tl.where(mask, qk, float("-inf"))
75
+
76
+ if not IS_EVEN_N:
77
+ qk = tl.where(((tl.arange(0, BLOCK_N) + start_n) < N_CTX)[None, :], qk, float("-inf"))
78
+
79
+ m_ij = tl.maximum(m_i, tl.max(qk, 1))
80
+ qk = qk - m_ij[:, None]
81
+ p = tl.math.exp2(qk)
82
+
83
+ if SLIDING_WINDOW:
84
+ p = tl.where(mask, p, 0)
85
+
86
+ if not IS_EVEN_N:
87
+ p = tl.where(((tl.arange(0, BLOCK_N) + start_n) < N_CTX)[None, :], p, 0)
88
+
89
+ l_ij = tl.sum(p, 1)
90
+ # -- update m_i and l_i
91
+ tmp = m_i - m_ij
92
+ alpha_mask = (tmp != tmp)  # NaN check: m_i starts at -inf, so (-inf) - (-inf) = NaN on rows never touched yet
93
+ alpha = tl.math.exp2(tmp)
94
+ alpha = tl.where(alpha_mask, 1., alpha)
95
+ l_i = l_i * alpha + l_ij
96
+ # -- update output accumulator --
97
+ acc = acc * alpha[:, None]
98
+ # update acc
99
+ if IS_EVEN_N:
100
+ v = tl.load(V_block_ptr)
101
+ else:
102
+ v = tl.load(V_block_ptr, boundary_check=(0, 1), padding_option="zero")
103
+
104
+ acc += tl.dot(p.to(v.dtype), v)
105
+ # update m_i and l_i
106
+ m_i = m_ij
107
+ V_block_ptr = tl.advance(V_block_ptr, (BLOCK_N, 0))
108
+ K_block_ptr = tl.advance(K_block_ptr, (0, BLOCK_N))
109
+
110
+ return acc, l_i, m_i
111
+
112
+
113
+ @triton.heuristics(
114
+ {
115
+ "IS_EVEN_M": lambda args: args["N_CTX"] % args["BLOCK_M"] == 0,
116
+ "IS_EVEN_N": lambda args: args["NKV_CTX"] % args["BLOCK_N"] == 0,
117
+ }
118
+ )
119
+ @triton.jit
120
+ def _attn_fwd(Q, K, V, sm_scale, M, Out, L,#
121
+ stride_qz, stride_qh, stride_qm, stride_qk, #
122
+ stride_kz, stride_kh, stride_kn, stride_kk, #
123
+ stride_vz, stride_vh, stride_vk, stride_vn, #
124
+ stride_oz, stride_oh, stride_om, stride_on, #
125
+ Z, H, H_KV, #
126
+ N_CTX, #
127
+ ROUND_CTX,
128
+ NKV_CTX,
129
+ sliding_window_offset,
130
+ sliding_window_size,
131
+ IS_EVEN_M: tl.constexpr,
132
+ IS_EVEN_N: tl.constexpr,
133
+ BLOCK_M: tl.constexpr, #
134
+ BLOCK_DMODEL: tl.constexpr, #
135
+ BLOCK_N: tl.constexpr, #
136
+ END: tl.constexpr,
137
+ INIT: tl.constexpr,
138
+ SLIDING_WINDOW: tl.constexpr,
139
+ COMPLEMENT_SLIDING_WINDOW: tl.constexpr
140
+ ):
141
+
142
+ start_m = tl.program_id(0)
143
+ off_hz = tl.program_id(1)
144
+ off_z = off_hz // H
145
+ off_h = off_hz % H
146
+ off_hkv = off_h // (H//H_KV)
147
+ q_offset = off_z.to(tl.int64) * stride_qz + off_h.to(tl.int64) * stride_qh
148
+ k_offset = off_z.to(tl.int64) * stride_kz + off_hkv.to(tl.int64) * stride_kh
149
+ v_offset = off_z.to(tl.int64) * stride_vz + off_hkv.to(tl.int64) * stride_vh
150
+ o_offset = off_z.to(tl.int64) * stride_oz + off_h.to(tl.int64) * stride_oh
151
+
152
+ # block pointers
153
+ Q_block_ptr = tl.make_block_ptr(
154
+ base=Q + q_offset,
155
+ shape=(N_CTX, BLOCK_DMODEL),
156
+ strides=(stride_qm, stride_qk),
157
+ offsets=(start_m * BLOCK_M, 0),
158
+ block_shape=(BLOCK_M, BLOCK_DMODEL),
159
+ order=(1, 0),
160
+ )
161
+ V_block_ptr = tl.make_block_ptr(
162
+ base=V + v_offset,
163
+ shape=(NKV_CTX, BLOCK_DMODEL),
164
+ strides=(stride_vk, stride_vn),
165
+ offsets=(0, 0),
166
+ block_shape=(BLOCK_N, BLOCK_DMODEL),
167
+ order=(1, 0),
168
+ )
169
+ K_block_ptr = tl.make_block_ptr(
170
+ base=K + k_offset,
171
+ shape=(BLOCK_DMODEL, NKV_CTX),
172
+ strides=(stride_kk, stride_kn),
173
+ offsets=(0, 0),
174
+ block_shape=(BLOCK_DMODEL, BLOCK_N),
175
+ order=(0, 1),
176
+ )
177
+ O_block_ptr = tl.make_block_ptr(
178
+ base=Out + o_offset,
179
+ shape=(ROUND_CTX, BLOCK_DMODEL),
180
+ strides=(stride_om, stride_on),
181
+ offsets=(start_m * BLOCK_M, 0),
182
+ block_shape=(BLOCK_M, BLOCK_DMODEL),
183
+ order=(1, 0),
184
+ )
185
+ # initialize offsets
186
+ offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
187
+ # initialize pointer to m and l
188
+ m_ptrs = M + off_hz * ROUND_CTX + offs_m
189
+ l_ptrs = L + off_hz * ROUND_CTX + offs_m
190
+ if INIT:
191
+ m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
192
+ l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + 1.0
193
+ acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
194
+ else:
195
+ # don't have to check boundary for q len
196
+ m_i = tl.load(m_ptrs).to(tl.float32)
197
+ l_i = tl.load(l_ptrs).to(tl.float32)
198
+ acc = tl.load(O_block_ptr).to(tl.float32)
199
+
200
+ qk_scale = sm_scale
201
+ qk_scale *= 1.4426950408889634  # log2(e) = 1/ln(2), so the exp2 calls below compute a natural-base softmax
202
+ # load q: it will stay in SRAM throughout
203
+ if IS_EVEN_M:
204
+ q = tl.load(Q_block_ptr)
205
+ else:
206
+ q = tl.load(Q_block_ptr, boundary_check=(0, 1), padding_option="zero")
207
+
208
+ acc, l_i, m_i = _attn_fwd_inner(acc, l_i, m_i, q, K_block_ptr, V_block_ptr, #
209
+ start_m, qk_scale, NKV_CTX, #
210
+ sliding_window_offset, sliding_window_size,
211
+ BLOCK_M, BLOCK_DMODEL, BLOCK_N, SLIDING_WINDOW, IS_EVEN_M, IS_EVEN_N,
212
+ COMPLEMENT_SLIDING_WINDOW)
213
+ # epilogue
214
+ if (END):
215
+ m_i += tl.math.log2(l_i)
216
+ acc = acc / l_i[:, None]
217
+ else:
218
+ tl.store(l_ptrs, l_i)
219
+
220
+ tl.store(m_ptrs, m_i)
221
+ tl.store(O_block_ptr, acc.to(Out.type.element_ty))
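+
+ # The m (running max) and l (running normalizer) buffers persist across successive _attn_fwd
+ # launches; that is what lets TritonMultiStageDotProductionAttention.append() fold several
+ # K/V segments into a single numerically stable softmax. Only the launch with END=True divides by l.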
222
+
223
+
224
+ @triton.heuristics(
225
+ {
226
+ "IS_EVEN_M": lambda args: args["N_CTX"] % args["BLOCK_M"] == 0,
227
+ "IS_EVEN_N": lambda args: args["NKV_CTX"] % args["BLOCK_N"] == 0,
228
+ }
229
+ )
230
+ @triton.jit
231
+ def _score_kernel(
232
+ Q, K, M, sm_scale, Out,
233
+ stride_qz, stride_qh, stride_qm, stride_qk, #
234
+ stride_kz, stride_kh, stride_kn, stride_kk, #
235
+ stride_oz, stride_oh, stride_on,
236
+ Z, H, H_KV, #
237
+ N_CTX, #
238
+ ROUND_CTX,
239
+ NKV_CTX,
240
+ sliding_window_offset,
241
+ sliding_window_size,
242
+ SLIDING_WINDOW: tl.constexpr,
243
+ COMPLEMENT_SLIDING_WINDOW: tl.constexpr,
244
+ IS_EVEN_M: tl.constexpr,
245
+ IS_EVEN_N: tl.constexpr,
246
+ BLOCK_M: tl.constexpr, #
247
+ BLOCK_DMODEL: tl.constexpr, #
248
+ BLOCK_N: tl.constexpr, #
249
+ ):
250
+ start_n = tl.program_id(0)
251
+ off_hz = tl.program_id(1)
252
+ off_z = off_hz // H
253
+ off_h = off_hz % H
254
+ off_hkv = off_h // (H//H_KV)
255
+ q_offset = off_z.to(tl.int64) * stride_qz + off_h.to(tl.int64) * stride_qh
256
+ k_offset = off_z.to(tl.int64) * stride_kz + off_hkv.to(tl.int64) * stride_kh
257
+ m_ptrs = M + off_hz * ROUND_CTX + tl.arange(0, BLOCK_M)
258
+ o = tl.zeros([BLOCK_M], dtype=tl.float32)
259
+
260
+ Q_block_ptr = tl.make_block_ptr(
261
+ base=Q + q_offset,
262
+ shape=(N_CTX, BLOCK_DMODEL),
263
+ strides=(stride_qm, stride_qk),
264
+ offsets=(0, 0),
265
+ block_shape=(BLOCK_M, BLOCK_DMODEL),
266
+ order=(1, 0),
267
+ )
268
+ K_block_ptr = tl.make_block_ptr(
269
+ base=K + k_offset,
270
+ shape=(BLOCK_DMODEL, NKV_CTX),
271
+ strides=(stride_kk, stride_kn),
272
+ offsets=(0, start_n * BLOCK_N),
273
+ block_shape=(BLOCK_DMODEL, BLOCK_N),
274
+ order=(0, 1),
275
+ )
276
+
277
+ if IS_EVEN_N:
278
+ k = tl.load(K_block_ptr)
279
+ else:
280
+ k = tl.load(K_block_ptr, boundary_check=(0, 1), padding_option="zero")
281
+
282
+
283
+ lo = 0
284
+ hi = ROUND_CTX
285
+ qk_scale = sm_scale
286
+ qk_scale *= 1.4426950408889634  # log2(e) = 1/ln(2), so the exp2 call below computes a natural-base softmax
287
+
288
+ for start_m in range(lo, hi, BLOCK_M):
289
+ start_m = tl.multiple_of(start_m, BLOCK_M)
290
+ if IS_EVEN_M:
291
+ q = tl.load(Q_block_ptr)
292
+ else:
293
+ q = tl.load(Q_block_ptr, boundary_check=(0,1), padding_option="zero")
294
+
295
+ m = tl.load(m_ptrs)
296
+
297
+ # calc qk
298
+ qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
299
+ qk += tl.dot(q, k)
300
+ qk = qk * qk_scale
301
+
302
+ if SLIDING_WINDOW:
303
+ # dist = tl.arange(start_m, start_m + BLOCK_M)[:, None] \
304
+ # - tl.arange(start_n * BLOCK_N, (start_n + 1) + BLOCK_N)[None, :] + sliding_window_offset
305
+ dist = tl.arange(0, BLOCK_M)[:, None] - tl.arange(0, BLOCK_N)[None, :] \
306
+ + start_m - start_n * BLOCK_N + sliding_window_offset
307
+
308
+ if COMPLEMENT_SLIDING_WINDOW:
309
+ mask = (dist >= sliding_window_size)
310
+ else:
311
+ mask = (dist >= 0) & (dist < sliding_window_size)
312
+
313
+ qk = qk - m[:, None]
314
+ p = tl.math.exp2(qk) # (BLOCK_M, BLOCK_N)
315
+
316
+ if SLIDING_WINDOW:
317
+ p = tl.where(mask, p, 0)
318
+
319
+ if not IS_EVEN_N:
320
+ p = tl.where(
321
+ ((tl.arange(0, BLOCK_M) + start_m) < N_CTX)[:, None],
322
+ p, 0
323
+ )
324
+
325
+ o += tl.sum(p, axis=0)
326
+
327
+
328
+ Q_block_ptr = tl.advance(Q_block_ptr, offsets=(BLOCK_M, 0))
329
+ m_ptrs = m_ptrs + BLOCK_M
330
+
331
+ o_offset = off_z.to(tl.int64) * stride_oz + off_h.to(tl.int64) * stride_oh
332
+ o_range = tl.arange(0, BLOCK_N) + start_n * BLOCK_N  # key positions covered by this program
333
+ o_ptrs = Out + o_offset + o_range
334
+ tl.store(o_ptrs, o.to(Out.type.element_ty), mask = o_range < NKV_CTX)
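+
+ # Because get_score runs after the END pass (where m absorbed log2(l)), exp2(qk - m) above is the
+ # fully normalized attention probability, so each Out entry is the total attention mass its key
+ # position receives, summed over all query rows.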
335
+
336
+ def get_score(q, k, m, sliding_window, complement_sliding_window):
337
+ assert q.dim() == 4
338
+ assert k.dim() == 4
339
+ assert m.dim() == 3
340
+ assert q.shape[:2] == m.shape[:2]
341
+ N_CTX = q.size(-2)
342
+ NKV_CTX = k.size(-2)
343
+ ROUND_CTX = m.size(-1)
344
+ ret = torch.zeros(
345
+ (q.size(0), q.size(1), k.size(2)),
346
+ dtype=k.dtype, device=k.device
347
+ )
348
+ if sliding_window is not None:
349
+ sliding_window_offset, sliding_window_size = sliding_window
350
+ else:
351
+ sliding_window_offset, sliding_window_size = None, None
352
+
353
+
354
+ grid = lambda META: (
355
+ triton.cdiv(k.shape[2], META["BLOCK_N"]),
356
+ q.shape[0] * q.shape[1]
357
+ )
358
+ sm_scale = 1 / math.sqrt(q.size(-1))
359
+
360
+ global _BLOCK_N
361
+ global _BLOCK_M
362
+
363
+ try:
364
+ _score_kernel[grid](
365
+ q, k, m, sm_scale, ret,
366
+ q.stride(0), q.stride(1), q.stride(2), q.stride(3),
367
+ k.stride(0), k.stride(1), k.stride(2), k.stride(3),
368
+ ret.stride(0), ret.stride(1), ret.stride(2),
369
+ q.size(0), q.size(1), k.size(1),
370
+ N_CTX, ROUND_CTX, NKV_CTX,
371
+ sliding_window_offset,
372
+ sliding_window_size,
373
+ SLIDING_WINDOW=(sliding_window is not None),
374
+ COMPLEMENT_SLIDING_WINDOW=complement_sliding_window,
375
+ BLOCK_M=_BLOCK_M,
376
+ BLOCK_N=_BLOCK_N,
377
+ BLOCK_DMODEL=q.size(-1)
378
+ )
379
+ except triton.OutOfResources as E:
380
+ from warnings import warn
381
+ _BLOCK_N = _BLOCK_N // 2
382
+ _BLOCK_M = _BLOCK_M // 2
383
+ warn(f"Triton Attention Output Resources. {E}\nUse smaller block size {_BLOCK_N}.")
384
+ _score_kernel[grid](
385
+ q, k, m, sm_scale, ret,
386
+ q.stride(0), q.stride(1), q.stride(2), q.stride(3),
387
+ k.stride(0), k.stride(1), k.stride(2), k.stride(3),
388
+ ret.stride(0), ret.stride(1), ret.stride(2),
389
+ q.size(0), q.size(1), k.size(1),
390
+ N_CTX, ROUND_CTX, NKV_CTX,
391
+ sliding_window_offset,
392
+ sliding_window_size,
393
+ SLIDING_WINDOW=(sliding_window is not None),
394
+ COMPLEMENT_SLIDING_WINDOW=complement_sliding_window,
395
+ BLOCK_M=_BLOCK_M,
396
+ BLOCK_N=_BLOCK_N,
397
+ BLOCK_DMODEL=q.size(-1)
398
+ )
399
+
400
+ return ret
401
+
402
+ def _forward(
403
+ q, k, v, sm_scale,
404
+ o = None, m = None, l = None, end = False,
405
+ sliding_window=None, init=False,
406
+ complement_sliding_window=False
407
+ ):
408
+ Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
409
+
410
+ assert Lq == Lk and Lk == Lv
411
+ assert Lk in {16, 32, 64, 128}
412
+
413
+ q_round_len = math.ceil(q.shape[2] / 64) * 64
414
+
415
+ if sliding_window is not None:
416
+ sliding_window_offset, sliding_window_size = sliding_window
417
+ else:
418
+ sliding_window_offset, sliding_window_size = None, None
419
+
420
+ grid = lambda META: (
421
+ triton.cdiv(q.shape[2], META["BLOCK_M"]),
422
+ q.shape[0] * q.shape[1],
423
+ )
424
+
425
+ global _BLOCK_N
426
+ global _BLOCK_M
427
+
428
+ try:
429
+ _attn_fwd[grid](
430
+ q, k, v, sm_scale, m, o, l, #
431
+ q.stride(0), q.stride(1), q.stride(2), q.stride(3), #
432
+ k.stride(0), k.stride(1), k.stride(2), k.stride(3), #
433
+ v.stride(0), v.stride(1), v.stride(2), v.stride(3), #
434
+ o.stride(0), o.stride(1), o.stride(2), o.stride(3), #
435
+ q.shape[0], q.shape[1], k.shape[1], #
436
+ q.shape[2], #
437
+ q_round_len,
438
+ k.shape[2],
439
+ sliding_window_offset,
440
+ sliding_window_size,
441
+ BLOCK_DMODEL=Lk, #
442
+ END=end,
443
+ INIT=init,
444
+ BLOCK_M=_BLOCK_M,
445
+ BLOCK_N=_BLOCK_N,
446
+ SLIDING_WINDOW=(sliding_window is not None),
447
+ COMPLEMENT_SLIDING_WINDOW=complement_sliding_window,
448
+ num_warps=4,
449
+ num_stages=4
450
+ )
451
+ except triton.OutOfResources as E:
452
+ _BLOCK_N = _BLOCK_N // 2
453
+ _BLOCK_M = _BLOCK_M // 2
454
+ from warnings import warn
455
+ warn(f"Triton Attention Output Resources. {E}\nUse smaller block size {_BLOCK_N}.")
456
+ _attn_fwd[grid](
457
+ q, k, v, sm_scale, m, o, l, #
458
+ q.stride(0), q.stride(1), q.stride(2), q.stride(3), #
459
+ k.stride(0), k.stride(1), k.stride(2), k.stride(3), #
460
+ v.stride(0), v.stride(1), v.stride(2), v.stride(3), #
461
+ o.stride(0), o.stride(1), o.stride(2), o.stride(3), #
462
+ q.shape[0], q.shape[1], k.shape[1], #
463
+ q.shape[2], #
464
+ q_round_len,
465
+ k.shape[2],
466
+ sliding_window_offset,
467
+ sliding_window_size,
468
+ BLOCK_DMODEL=Lk, #
469
+ END=end,
470
+ INIT=init,
471
+ BLOCK_M=_BLOCK_M,
472
+ BLOCK_N=_BLOCK_N,
473
+ SLIDING_WINDOW=(sliding_window is not None),
474
+ COMPLEMENT_SLIDING_WINDOW=complement_sliding_window,
475
+ num_warps=4,
476
+ num_stages=4
477
+ )
478
+
479
+
480
+ if end:
481
+ o = o[:, :, :q.shape[2], :].contiguous().to(q.dtype)
482
+
483
+ return o, m, l
484
+
485
+ class MultiStageDotProductionAttention:
486
+ def __init__(
487
+ self,
488
+ q_shape,
489
+ dtype,
490
+ device,
491
+ ):
492
+ self.q_shape = q_shape
493
+ self.dtype = dtype
494
+ self.device = device
495
+ self.end = False
496
+ self.ret = torch.zeros(
497
+ q_shape, dtype=dtype, device=device
498
+ )
499
+ self.score_list = []
500
+
501
+ def append(
502
+ self,
503
+ q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
504
+ sliding_window=None, complement_sliding_window: bool = False,
505
+ end=False, get_score=False,
506
+ *args, **kwargs
507
+ ):
508
+ raise NotImplementedError
509
+
510
+
511
+ def get_result(self):
512
+ return self.ret, self.score_list
513
+
514
+
515
+ class TritonMultiStageDotProductionAttention(MultiStageDotProductionAttention):
516
+ def __init__(self, q_shape, dtype, device):
517
+ self.q_shape = q_shape
518
+ self.dtype = dtype
519
+ self.device = device
520
+ q_round_len = math.ceil(q_shape[2] / 64) * 64
521
+ o_shape = (q_shape[0], q_shape[1], q_round_len, q_shape[3])
522
+ m_shape = (q_shape[0], q_shape[1], q_round_len)
523
+ l_shape = (q_shape[0], q_shape[1], q_round_len)
524
+
525
+ self.o = torch.empty(o_shape, device=device, dtype=torch.float32)
526
+ self.m = torch.empty(m_shape, device=device, dtype=torch.float32)
527
+ self.l = torch.empty(l_shape, device=device, dtype=torch.float32)
528
+ self.q_list = []
529
+ self.k_list = []
530
+ self.sliding_window_list = []
531
+ self.complement_sliding_window_list = []
532
+ self.score_list = []
533
+ self.end = False
534
+ self.init = False
535
+
536
+ def finalize(self):
537
+ self.end = True
538
+ for q, k, sliding_window, comp in zip(self.q_list, self.k_list, self.sliding_window_list, self.complement_sliding_window_list):
539
+ if q is not None:
540
+ score = get_score(q, k, self.m, sliding_window, comp)
541
+ self.score_list.append(score)
542
+ else:
543
+ self.score_list.append(None)
544
+
545
+ self.ret = self.o
546
+
547
+ def append(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, end=False, get_score=False, sliding_window = None, complement_sliding_window: bool = False):
548
+ assert q.shape == self.q_shape
549
+
550
+ if isinstance(sliding_window, int):
551
+ sliding_window = (
552
+ k.shape[2] - q.shape[2], sliding_window
553
+ )
554
+
555
+ q = q.contiguous()
556
+ k = k.contiguous()
557
+ v = v.contiguous()
558
+
559
+ sm_scale = 1 / math.sqrt(q.shape[-1])
560
+ o, m, l = _forward(
561
+ q, k, v, sm_scale, self.o, self.m, self.l,
562
+ sliding_window=sliding_window, end=end, init=not self.init,
563
+ complement_sliding_window=complement_sliding_window
564
+ )
565
+ self.init = True
566
+ self.o = o
567
+ self.m = m
568
+ self.l = l
569
+ if get_score:
570
+ self.q_list.append(q)
571
+ self.k_list.append(k)
572
+ self.sliding_window_list.append(sliding_window)
573
+ self.complement_sliding_window_list.append(complement_sliding_window)
574
+ else:
575
+ self.q_list.append(None)
576
+ self.k_list.append(None)
577
+ self.sliding_window_list.append(None)
578
+ self.complement_sliding_window_list.append(None)
579
+
580
+ if end:
581
+ assert not self.end
582
+ self.finalize()
583
+
584
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
585
+ """
586
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
587
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
588
+ """
589
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
590
+ if n_rep == 1:
591
+ return hidden_states
592
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
593
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
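+ # e.g. (batch, kv_heads=8, seqlen, head_dim) with n_rep=4 -> (batch, 32, seqlen, head_dim)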
594
+
595
+ def streaming_forward(
596
+ q, k, v,
597
+ n_init, n_local,
598
+ ):
599
+ # q, k, v must already have RoPE applied
+ # k and v must already be repeated (see repeat_kv) so that their shapes match q's
601
+
602
+ assert q.dim() == 4 # (bsz, num_heads, seqlen, head_dim)
603
+ assert q.shape == k.shape == v.shape
604
+
605
+ head_dim = q.shape[-1]
606
+ if head_dim not in [16, 32, 64, 128, 256, 512]:
607
+ target_dim = 2 ** math.ceil(math.log2(head_dim)) - head_dim
608
+ q = torch.nn.functional.pad(q, [0, target_dim, 0, 0, 0, 0, 0, 0])
609
+ k = torch.nn.functional.pad(k, [0, target_dim, 0, 0, 0, 0, 0, 0])
610
+ v = torch.nn.functional.pad(v, [0, target_dim, 0, 0, 0, 0, 0, 0])
611
+
612
+ q_len = q.size(2)
613
+ k_len = k.size(2)
614
+
615
+ attn = TritonMultiStageDotProductionAttention(q.shape, q.dtype, q.device)
616
+
617
+ if k_len > n_local:
618
+ init_k = k[:, :, :n_init, :].contiguous()
619
+ init_v = v[:, :, :n_init, :].contiguous()
620
+
621
+ attn.append(q, k, v, sliding_window=n_local)
622
+ attn.append(
623
+ q, init_k, init_v, end=True,
624
+ sliding_window=(k_len - q_len, n_local), complement_sliding_window=True
625
+ )
626
+ else:
627
+ attn.append(q, k, v, sliding_window=n_local, end=True)
628
+
629
+ score, _ = attn.get_result()
630
+ return score[..., :head_dim]
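+
+ # StreamingLLM-style decomposition: one pass over the local sliding window, then a second pass
+ # over the first n_init "attention sink" tokens using the complement of the same window, so no
+ # key is counted twice; the multi-stage softmax above merges the two passes exactly.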
631
+
632
+ def streaming_forward2(
633
+ q, k, v,
634
+ n_init, n_local,
635
+ ):
636
+ q_len = q.size(2)
637
+ k_len = k.size(2)
638
+
639
+ attn = TritonMultiStageDotProductionAttention(q.shape, q.dtype, q.device)
640
+
641
+ if k_len > n_local:
642
+ init_k = k[:, :, :n_init, :].contiguous()
643
+ init_v = v[:, :, :n_init, :].contiguous()
644
+
645
+ else:
646
+ init_k = torch.empty(
647
+ (k.size(0), k.size(1), 0, k.size(3)),
648
+ dtype=k.dtype, device=k.device
649
+ )
650
+ init_v = torch.empty(
651
+ (v.size(0), v.size(1), 0, v.size(3)),
652
+ dtype=v.dtype, device=v.device
653
+ )
654
+
655
+ attn.append(q, k, v, sliding_window=n_local)
656
+ attn.append(
657
+ q, init_k, init_v, end=True,
658
+ sliding_window=(k_len - q_len, n_local), complement_sliding_window=True
659
+ )
660
+
661
+ score, _ = attn.get_result()
662
+ return score
663
+
664
+ def stream_llm_forward(n_local, n_init, *args, **kwargs):
665
+ Attn = TritonMultiStageDotProductionAttention
666
+ def forward(self, query : torch.Tensor,
667
+ key_value : torch.Tensor,
668
+ position_bias : torch.Tensor,
669
+ use_cache: bool,
670
+ past_key_value,
671
+ project_q, project_k, project_v, attention_out,
672
+ dim_head, num_heads, num_heads_kv
673
+ ):
674
+
675
+ batch_size = query.size(0)
676
+ len_q = query.size(1)
677
+ len_k = key_value.size(1)
678
+
679
+ h_q = project_q(query) # (batch, len_q, num_heads * dim_head)
680
+ h_k = project_k(key_value) # (batch, len_k, num_heads * dim_head)
681
+ h_v = project_v(key_value) # (batch, len_k, num_heads * dim_head)
682
+
683
+ h_q = h_q.view(batch_size, len_q, num_heads, dim_head).permute(0, 2, 1, 3) # (batch, num_heads, len_q, dim_head)
684
+ h_k = h_k.view(batch_size, len_k, num_heads_kv, dim_head).permute(0, 2, 1, 3) # (batch, num_heads_kv, len_k, dim_head)
685
+ h_v = h_v.view(batch_size, len_k, num_heads_kv, dim_head).permute(0, 2, 1, 3) # (batch, num_heads_kv, len_k, dim_head)
686
+
687
+ h_q = h_q.contiguous() # (batch * num_heads, len_q, dim_head)
688
+ h_k = h_k.contiguous() # (batch * num_heads, len_k, dim_head)
689
+ h_v = h_v.contiguous() # (batch * num_heads, len_k, dim_head)
690
+
691
+ if past_key_value is not None:
692
+ h_k = torch.cat([past_key_value[0], h_k], dim=-2)
693
+ h_v = torch.cat([past_key_value[1], h_v], dim=-2)
694
+
695
+ len_k += past_key_value[2]
696
+
697
+ if use_cache:
698
+ if len_k <= n_local + n_init:
699
+ h_k_cache = h_k
700
+ h_v_cache = h_v
701
+ else:
702
+ h_k_cache = torch.cat([h_k[:, :, :n_init, :], h_k[:, :, max(0, h_k.size(-2) - n_local):, :]], dim=2)
+ h_v_cache = torch.cat([h_v[:, :, :n_init, :], h_v[:, :, max(0, h_v.size(-2) - n_local):, :]], dim=2)
704
+
705
+ current_key_value = (h_k_cache, h_v_cache, len_k)
706
+
707
+ else:
708
+ current_key_value = None
709
+
710
+ h_q_ = h_q
711
+ h_k_ = h_k
712
+ h_v_ = h_v
713
+
714
+ if len_q + n_local < h_k_.size(-2):
715
+ h_k_ = h_k_[:, :, h_k_.size(-2) - len_q - n_local:, :].contiguous().clone()
716
+ h_v_ = h_v_[:, :, h_v_.size(-2) - len_q - n_local:, :].contiguous().clone()
717
+
718
+ local_h_q, local_h_k = position_bias(h_q_, h_k_)
719
+ local_h_v = h_v_
720
+
721
+ if len_k > n_local:
722
+ init_h_q = position_bias.apply_rotary_pos_emb_one_angle(
723
+ h_q, n_local + n_init
724
+ )
725
+ init_h_k = position_bias.apply_rotary_pos_emb(
726
+ h_k[:, :, :n_init, :].contiguous(),
727
+ n_init, n_init, position_bias._cos_cached, position_bias._sin_cached
728
+ )
729
+ init_h_v = h_v[:, :, :n_init, :].contiguous()
730
+
731
+ else:
732
+ init_h_q = h_q
733
+ init_h_k = torch.empty(
734
+ (batch_size, num_heads_kv, 0, dim_head),
735
+ device=h_k.device,
736
+ dtype=h_k.dtype
737
+ )
738
+ init_h_v = torch.empty(
739
+ (batch_size, num_heads_kv, 0, dim_head),
740
+ device=h_v.device,
741
+ dtype=h_v.dtype
742
+ )
743
+
744
+ attn = Attn(local_h_q.shape, local_h_q.dtype, local_h_q.device)
745
+ attn.append(local_h_q, local_h_k, local_h_v, sliding_window=n_local)
746
+ attn.append(
747
+ init_h_q, init_h_k, init_h_v, end=True,
748
+ sliding_window=(len_k - len_q, n_local),
749
+ complement_sliding_window=True
750
+ )
751
+ score, _ = attn.get_result()
752
+
753
+ score = score.view(batch_size, num_heads, len_q, dim_head).permute(0, 2, 1, 3).contiguous() # (batch, len_q, num_heads, dim_head)
754
+ score = score.reshape(batch_size, len_q, num_heads * dim_head) # (batch, len_q, num_heads * dim_head)
755
+
756
+ score = attention_out(score)
757
+
758
+ if use_cache:
759
+ return score, current_key_value
760
+ else:
761
+ return score
762
+
763
+ return forward
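+
+ # stream_llm_forward returns a closure whose (query, key_value, position_bias, ...) signature
+ # matches what huggingface_forward in minference/patch.py passes when it rebinds attention modules.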
minference/patch.py ADDED
@@ -0,0 +1,1279 @@
1
+ import json
2
+
3
+ import torch
4
+ import transformers
5
+ from transformers.cache_utils import *
6
+ from transformers.models.llama.modeling_llama import *
7
+
8
+ from .modules.inf_llm import InfLLMGenerator, inf_llm_forward
9
+ from .modules.minference_forward import (
10
+ gather_last_q_vertical_slash_topk_v4,
11
+ gather_last_q_vertical_slash_topk_vllm,
12
+ init_minference_parameters,
13
+ minference_forward,
14
+ minference_kv_cache_cpu_forward,
15
+ minference_vllm_forward,
16
+ minference_with_snapkv_forward,
17
+ search_pattern,
18
+ sum_all_diagonal_matrix,
19
+ )
20
+ from .ops.streaming_kernel import stream_llm_forward
21
+
22
+
23
+ class RotaryEmbeddingESM(torch.nn.Module):
24
+ """
25
+ Rotary position embeddings based on those in
26
+ [RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer). Query and keys are transformed by rotation
27
+ matrices which depend on their relative positions.
28
+ """
29
+
30
+ def __init__(
31
+ self,
32
+ dim: int,
33
+ base: Union[int, float] = 10000,
34
+ distance_scale: Union[int, float] = 1,
35
+ ):
36
+ super().__init__()
37
+ self.base = base
38
+ self.distance_scale = distance_scale
39
+
40
+ # Generate and save the inverse frequency buffer (non trainable)
41
+ inv_freq = 1.0 / (
42
+ base ** (torch.arange(0, dim, 2, device="cuda", dtype=torch.float32) / dim)
43
+ )
44
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
45
+
46
+ self._seq_len_cached = -1
47
+ self._cos_cached = None
48
+ self._sin_cached = None
49
+
50
+ def rotate_half(self, x):
51
+ x1, x2 = x.chunk(2, dim=-1)
52
+ return torch.cat((-x2, x1), dim=-1)
53
+
54
+ def apply_rotary_pos_emb(self, x, length, right, cos, sin):
55
+ dtype = x.dtype
56
+ if cos.dim() == 2:
57
+ cos = cos[right - length : right, :]
58
+ sin = sin[right - length : right, :]
59
+ elif cos.dim() == 3:
60
+ cos = cos[:, right - length : right, :]
61
+ sin = sin[:, right - length : right, :]
62
+ elif cos.dim() == 4:
63
+ cos = cos[:, :, right - length : right, :]
64
+ sin = sin[:, :, right - length : right, :]
65
+
66
+ return ((x.float() * cos) + (self.rotate_half(x).float() * sin)).to(dtype)
67
+
68
+ def _update_cos_sin_tables(self, x, seq_dim):
69
+ seq_len = x.size(seq_dim)
70
+ if seq_len > self._seq_len_cached:
71
+ self._seq_len_cached = seq_len
72
+ t = torch.arange(seq_len, device=x.device).type_as(self.inv_freq)
73
+ freqs = torch.outer(t * self.distance_scale, self.inv_freq)
74
+ emb = torch.cat((freqs, freqs), dim=-1)
75
+ if x.dim() == 2:
76
+ self._cos_cached = emb.cos()
77
+ self._sin_cached = emb.sin()
78
+ elif x.dim() == 3:
79
+ self._cos_cached = emb.cos()[None, :, :]
80
+ self._sin_cached = emb.sin()[None, :, :]
81
+ elif x.dim() == 4:
82
+ self._cos_cached = emb.cos()[None, None, :, :]
83
+ self._sin_cached = emb.sin()[None, None, :, :]
84
+ return self._cos_cached, self._sin_cached
85
+
86
+ def _update_cos_sin_tables_len(self, seq_len, device, dim=None):
87
+ if seq_len > self._seq_len_cached:
88
+ if dim is None:
89
+ assert self._cos_cached is not None
90
+ dim = self._cos_cached.dim()
91
+
92
+ self._seq_len_cached = seq_len
93
+ t = torch.arange(seq_len, device=device).type_as(self.inv_freq)
94
+ freqs = torch.outer(t * self.distance_scale, self.inv_freq)
95
+ emb = torch.cat((freqs, freqs), dim=-1)
96
+ if dim == 2:
97
+ self._cos_cached = emb.cos()
98
+ self._sin_cached = emb.sin()
99
+ elif dim == 3:
100
+ self._cos_cached = emb.cos()[None, :, :]
101
+ self._sin_cached = emb.sin()[None, :, :]
102
+ elif dim == 4:
103
+ self._cos_cached = emb.cos()[None, None, :, :]
104
+ self._sin_cached = emb.sin()[None, None, :, :]
105
+
106
+ return self._cos_cached, self._sin_cached
107
+
108
+ def apply_rotary_pos_emb_one_angle(self, x: torch.Tensor, index):
109
+ dtype = x.dtype
110
+ cos, sin = self._update_cos_sin_tables_len(index, x.device)
111
+ if cos.dim() == 2:
112
+ cos = cos[index - 1 : index, :]
113
+ sin = sin[index - 1 : index, :]
114
+ elif cos.dim() == 3:
115
+ cos = cos[:, index - 1 : index, :]
116
+ sin = sin[:, index - 1 : index, :]
117
+ elif cos.dim() == 4:
118
+ cos = cos[:, :, index - 1 : index, :]
119
+ sin = sin[:, :, index - 1 : index, :]
120
+
121
+ return ((x.float() * cos) + (self.rotate_half(x).float() * sin)).to(dtype)
122
+
123
+ def forward(
124
+ self, q: torch.Tensor, k: torch.Tensor, seq_dim=-2
125
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
126
+ self._cos_cached, self._sin_cached = self._update_cos_sin_tables(
127
+ k, seq_dim=seq_dim
128
+ )
129
+ return (
130
+ self.apply_rotary_pos_emb(
131
+ q, q.size(seq_dim), k.size(seq_dim), self._cos_cached, self._sin_cached
132
+ ),
133
+ self.apply_rotary_pos_emb(
134
+ k, k.size(seq_dim), k.size(seq_dim), self._cos_cached, self._sin_cached
135
+ ),
136
+ )
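+
+ # Illustrative usage (shapes are assumptions): rot = RotaryEmbeddingESM(dim=128) followed by
+ # q, k = rot(q, k) rotates both tensors, with k's sequence length fixing the absolute position range.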
137
+
138
+
139
+ ATTN_FORWRAD = {
140
+ "streaming": stream_llm_forward,
141
+ "minference": minference_forward,
142
+ "inf_llm": inf_llm_forward,
143
+ }
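+
+ # Maps an attn_type name to the factory that builds the corresponding attention forward.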
144
+
145
+
146
+ def huggingface_forward(forward):
147
+ def hf_forward(
148
+ self,
149
+ hidden_states: torch.Tensor,
150
+ attention_mask=None,
151
+ position_ids=None,
152
+ past_key_value=None,
153
+ output_attentions: bool = False,
154
+ use_cache: bool = False,
155
+ **kwargs,
156
+ ):
157
+ assert not output_attentions
158
+ ret = forward(
159
+ self,
160
+ hidden_states,
161
+ hidden_states,
162
+ position_ids,
163
+ use_cache,
164
+ past_key_value,
165
+ self.q_proj,
166
+ self.k_proj,
167
+ self.v_proj,
168
+ self.o_proj,
169
+ self.head_dim,
170
+ self.num_heads,
171
+ self.num_key_value_heads,
172
+ )
173
+ if use_cache:
174
+ o, pkv = ret
175
+ else:
176
+ o = ret
177
+ pkv = None
178
+
179
+ return o, None, pkv
180
+
181
+ return hf_forward
182
+
183
+
184
+ def hf_437_prepare_inputs_for_generation(
185
+ self,
186
+ input_ids,
187
+ past_key_values=None,
188
+ attention_mask=None,
189
+ inputs_embeds=None,
190
+ **kwargs,
191
+ ):
192
+ if past_key_values is not None:
193
+ if isinstance(past_key_values, transformers.cache_utils.Cache):
194
+ cache_length = past_key_values.get_seq_length()
195
+ past_length = past_key_values.seen_tokens
196
+ max_cache_length = past_key_values.get_max_length()
197
+ else:
198
+ cache_length = past_length = past_key_values[0][0].shape[2]
199
+ max_cache_length = None
200
+
201
+ # Keep only the unprocessed tokens:
202
+ # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
203
+ # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
204
+ # input)
205
+ if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
206
+ input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
207
+ # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
208
+ # input_ids based on the past_length.
209
+ elif past_length < input_ids.shape[1]:
210
+ input_ids = input_ids[:, past_length:]
211
+ # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
212
+
213
+ # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
214
+ if (
215
+ max_cache_length is not None
216
+ and attention_mask is not None
217
+ and cache_length + input_ids.shape[1] > max_cache_length
218
+ ):
219
+ attention_mask = attention_mask[:, -max_cache_length:]
220
+
221
+ position_ids = kwargs.get("position_ids", None)
222
+ if attention_mask is not None and position_ids is None:
223
+ # create position_ids on the fly for batch generation
224
+ position_ids = attention_mask.long().cumsum(-1) - 1
225
+ position_ids.masked_fill_(attention_mask == 0, 1)
226
+ if past_key_values:
227
+ position_ids = position_ids[:, -input_ids.shape[1] :]
228
+
229
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
230
+ if inputs_embeds is not None and past_key_values is None:
231
+ model_inputs = {"inputs_embeds": inputs_embeds}
232
+ else:
233
+ model_inputs = {"input_ids": input_ids}
234
+
235
+ model_inputs.update(
236
+ {
237
+ "position_ids": position_ids,
238
+ "past_key_values": past_key_values,
239
+ "use_cache": kwargs.get("use_cache"),
240
+ "attention_mask": attention_mask,
241
+ }
242
+ )
243
+ return model_inputs
244
+
245
+
246
+ def prepare_inputs_for_generation(
247
+ self,
248
+ input_ids,
249
+ past_key_values=None,
250
+ attention_mask=None,
251
+ inputs_embeds=None,
252
+ cache_position=None,
253
+ **kwargs,
254
+ ):
255
+ # With static cache, the `past_key_values` is None
256
+ # TODO joao: standardize interface for the different Cache classes and remove of this if
257
+ has_static_cache = False
258
+ if past_key_values is None:
259
+ past_key_values = getattr(
260
+ getattr(self.model.layers[0], "self_attn", {}), "past_key_value", None
261
+ )
262
+ has_static_cache = past_key_values is not None
263
+
264
+ past_length = 0
265
+ if past_key_values is not None:
266
+ if isinstance(past_key_values, transformers.cache_utils.Cache):
267
+ past_length = (
268
+ cache_position[0]
269
+ if cache_position is not None
270
+ else past_key_values.get_seq_length()
271
+ )
272
+ max_cache_length = (
273
+ torch.tensor(past_key_values.get_max_length(), device=input_ids.device)
274
+ if past_key_values.get_max_length() is not None
275
+ else None
276
+ )
277
+ cache_length = (
278
+ past_length
279
+ if max_cache_length is None
280
+ else torch.min(max_cache_length, past_length)
281
+ )
282
+ # TODO joao: remove this `else` after `generate` prioritizes `Cache` objects
283
+ else:
284
+ # cache_length = past_length = past_key_values[0][0].shape[2]
285
+ cache_length = past_length = cache_position[0]
286
+ max_cache_length = None
287
+
288
+ # Keep only the unprocessed tokens:
289
+ # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
290
+ # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
291
+ # input)
292
+ if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
293
+ input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
294
+ # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
295
+ # input_ids based on the past_length.
296
+ elif past_length < input_ids.shape[1]:
297
+ input_ids = input_ids[:, past_length:]
298
+ # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
299
+
300
+ # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
301
+ if (
302
+ max_cache_length is not None
303
+ and attention_mask is not None
304
+ and cache_length + input_ids.shape[1] > max_cache_length
305
+ ):
306
+ attention_mask = attention_mask[:, -max_cache_length:]
307
+
308
+ position_ids = kwargs.get("position_ids", None)
309
+ if attention_mask is not None and position_ids is None:
310
+ # create position_ids on the fly for batch generation
311
+ position_ids = attention_mask.long().cumsum(-1) - 1
312
+ position_ids.masked_fill_(attention_mask == 0, 1)
313
+ if past_key_values:
314
+ position_ids = position_ids[:, -input_ids.shape[1] :]
315
+
316
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
317
+ if inputs_embeds is not None and past_key_values is None:
318
+ model_inputs = {"inputs_embeds": inputs_embeds}
319
+ else:
320
+ # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise
321
+ # recompiles graphs as the stride of the inputs is a guard. Ref: https://github.com/huggingface/transformers/pull/29114
322
+ # TODO: use `next_tokens` directly instead.
323
+ model_inputs = {"input_ids": input_ids.contiguous()}
324
+
325
+ input_length = (
326
+ position_ids.shape[-1] if position_ids is not None else input_ids.shape[-1]
327
+ )
328
+ if cache_position is None:
329
+ cache_position = torch.arange(
330
+ past_length, past_length + input_length, device=input_ids.device
331
+ )
332
+ else:
333
+ cache_position = cache_position[-input_length:]
334
+
335
+ if has_static_cache:
336
+ past_key_values = None
337
+
338
+ model_inputs.update(
339
+ {
340
+ "position_ids": position_ids,
341
+ "cache_position": cache_position,
342
+ "past_key_values": past_key_values,
343
+ "use_cache": kwargs.get("use_cache"),
344
+ "attention_mask": attention_mask,
345
+ }
346
+ )
347
+ return model_inputs
348
+
349
+
350
+ def prepare_inputs_for_generation_snapkv(
351
+ self,
352
+ input_ids,
353
+ past_key_values=None,
354
+ attention_mask=None,
355
+ inputs_embeds=None,
356
+ **kwargs,
357
+ ):
358
+ if past_key_values is None: # [SnapKV]
359
+ for layer in self.model.layers:
360
+ layer.self_attn.kv_seq_len = 0
361
+ if past_key_values is not None:
362
+ if isinstance(past_key_values, Cache):
363
+ cache_length = past_key_values.get_seq_length()
364
+ past_length = past_key_values.seen_tokens
365
+ max_cache_length = past_key_values.get_max_length()
366
+ else:
367
+ # cache_length = past_length = past_key_values[0][0].shape[2]
368
+ # max_cache_length = None
369
+ cache_length = past_length = self.model.layers[0].self_attn.kv_seq_len
370
+ max_cache_length = None
371
+ # Keep only the unprocessed tokens:
372
+ # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
373
+ # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
374
+ # input)
375
+ if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
376
+ input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
377
+ # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
378
+ # input_ids based on the past_length.
379
+ elif past_length < input_ids.shape[1]:
380
+ input_ids = input_ids[:, past_length:]
381
+ # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
382
+
383
+ # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
384
+ if (
385
+ max_cache_length is not None
386
+ and attention_mask is not None
387
+ and cache_length + input_ids.shape[1] > max_cache_length
388
+ ):
389
+ attention_mask = attention_mask[:, -max_cache_length:]
390
+
391
+ position_ids = kwargs.get("position_ids", None)
392
+ if attention_mask is not None and position_ids is None:
393
+ # create position_ids on the fly for batch generation
394
+ position_ids = attention_mask.long().cumsum(-1) - 1
395
+ position_ids.masked_fill_(attention_mask == 0, 1)
396
+ if past_key_values:
397
+ position_ids = position_ids[:, -input_ids.shape[1] :]
398
+
399
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
400
+ if inputs_embeds is not None and past_key_values is None:
401
+ model_inputs = {"inputs_embeds": inputs_embeds}
402
+ else:
403
+ model_inputs = {"input_ids": input_ids}
404
+
405
+ model_inputs.update(
406
+ {
407
+ "position_ids": position_ids,
408
+ "past_key_values": past_key_values,
409
+ "use_cache": kwargs.get("use_cache"),
410
+ "attention_mask": attention_mask,
411
+ }
412
+ )
413
+ return model_inputs
414
+
415
+
416
+ def _prepare_decoder_attention_mask_inference(
417
+ self, attention_mask, input_shape, inputs_embeds, past_key_values_length
418
+ ):
419
+ # [bsz, seq_len]
420
+ if past_key_values_length > 0 and attention_mask is not None:
421
+ attention_mask = torch.cat(
422
+ (
423
+ torch.full(
424
+ (input_shape[0], past_key_values_length),
425
+ True,
426
+ dtype=attention_mask.dtype,
427
+ device=attention_mask.device,
428
+ ),
429
+ attention_mask,
430
+ ),
431
+ dim=-1,
432
+ )
433
+
434
+ if attention_mask is not None and torch.all(attention_mask):
435
+ return None # This uses the faster call when training with full samples
436
+
437
+ return attention_mask
438
+
439
+
440
+ def forward_llama_decoder_layer(
441
+ self,
442
+ hidden_states: torch.Tensor,
443
+ attention_mask: Optional[torch.Tensor] = None,
444
+ position_ids: Optional[torch.LongTensor] = None,
445
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
446
+ output_attentions: Optional[bool] = False,
447
+ use_cache: Optional[bool] = False,
448
+ padding_mask: Optional[torch.LongTensor] = None,
449
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
450
+ """
451
+ Args:
452
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
453
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
454
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
455
+ output_attentions (`bool`, *optional*):
456
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
457
+ returned tensors for more detail.
458
+ use_cache (`bool`, *optional*):
459
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
460
+ (see `past_key_values`).
461
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
462
+ """
463
+
464
+ residual = hidden_states.clone()
465
+ batch, seq_len, embed_dim = hidden_states.shape
466
+
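+ # Apply the input LayerNorm in 32k-token chunks, in place, to cap peak activation memory on long prompts.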
467
+ for start_idx in range(0, seq_len, 32000):
468
+ end_idx = min(seq_len, start_idx + 32000)
469
+ hidden_states[:, start_idx:end_idx, :] = self.input_layernorm(
470
+ hidden_states[:, start_idx:end_idx, :]
471
+ )
472
+
473
+ # Self Attention
474
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
475
+ hidden_states=hidden_states,
476
+ attention_mask=attention_mask,
477
+ position_ids=position_ids,
478
+ past_key_value=past_key_value,
479
+ output_attentions=output_attentions,
480
+ use_cache=use_cache,
481
+ padding_mask=padding_mask,
482
+ )
483
+ hidden_states = residual + hidden_states
484
+
485
+ # Fully Connected
486
+ for start_idx in range(0, seq_len, 32000):
487
+ end_idx = min(seq_len, start_idx + 32000)
488
+ part_hidden_states = hidden_states[:, start_idx:end_idx, :].clone()
489
+ part_hidden_states = self.post_attention_layernorm(part_hidden_states)
490
+ part_hidden_states = self.mlp(part_hidden_states)
491
+ hidden_states[:, start_idx:end_idx, :] += part_hidden_states
492
+
493
+ outputs = (hidden_states,)
494
+
495
+ if output_attentions:
496
+ outputs += (self_attn_weights,)
497
+
498
+ if use_cache:
499
+ outputs += (present_key_value,)
500
+
501
+ return outputs
502
+
503
+
504
+ def forward_llama_model(
505
+ self,
506
+ input_ids: torch.LongTensor = None,
507
+ attention_mask: Optional[torch.Tensor] = None,
508
+ position_ids: Optional[torch.LongTensor] = None,
509
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
510
+ inputs_embeds: Optional[torch.FloatTensor] = None,
511
+ use_cache: Optional[bool] = None,
512
+ output_attentions: Optional[bool] = None,
513
+ output_hidden_states: Optional[bool] = None,
514
+ return_dict: Optional[bool] = None,
515
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
516
+ output_attentions = (
517
+ output_attentions
518
+ if output_attentions is not None
519
+ else self.config.output_attentions
520
+ )
521
+ output_hidden_states = (
522
+ output_hidden_states
523
+ if output_hidden_states is not None
524
+ else self.config.output_hidden_states
525
+ )
526
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
527
+
528
+ return_dict = (
529
+ return_dict if return_dict is not None else self.config.use_return_dict
530
+ )
531
+
532
+ # retrieve input_ids and inputs_embeds
533
+ if input_ids is not None and inputs_embeds is not None:
534
+ raise ValueError(
535
+ "You cannot specify both input_ids and inputs_embeds at the same time"
536
+ )
537
+ elif input_ids is not None:
538
+ batch_size, seq_length = input_ids.shape[:2]
539
+ elif inputs_embeds is not None:
540
+ batch_size, seq_length = inputs_embeds.shape[:2]
541
+ else:
542
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
543
+
544
+ if self.gradient_checkpointing and self.training:
545
+ if use_cache:
546
+ logger.warning_once(
547
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
548
+ )
549
+ use_cache = False
550
+
551
+ seq_length_with_past = seq_length
552
+ past_key_values_length = 0
553
+
554
+ if use_cache:
555
+ use_legacy_cache = not isinstance(past_key_values, Cache)
556
+ if use_legacy_cache:
557
+ past_key_values = DynamicCache.from_legacy_cache(past_key_values)
558
+ past_key_values_length = past_key_values.get_usable_length(seq_length)
559
+ seq_length_with_past = seq_length_with_past + past_key_values_length
560
+
561
+ if position_ids is None:
562
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
563
+ position_ids = torch.arange(
564
+ past_key_values_length,
565
+ seq_length + past_key_values_length,
566
+ dtype=torch.long,
567
+ device=device,
568
+ )
569
+ position_ids = position_ids.unsqueeze(0)
570
+
571
+ if inputs_embeds is None:
572
+ inputs_embeds = self.embed_tokens(input_ids)
573
+
574
+ if attention_mask is None:
575
+ attention_mask = torch.ones(
576
+ (batch_size, seq_length_with_past),
577
+ dtype=torch.bool,
578
+ device=inputs_embeds.device,
579
+ )
580
+ padding_mask = None
581
+ else:
582
+ if 0 in attention_mask:
583
+ padding_mask = attention_mask
584
+ else:
585
+ padding_mask = None
586
+
587
+ attention_mask = self._prepare_decoder_attention_mask(
588
+ attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
589
+ )
590
+
591
+ # embed positions
592
+ hidden_states = inputs_embeds
593
+
594
+ # decoder layers
595
+ all_hidden_states = () if output_hidden_states else None
596
+ all_self_attns = () if output_attentions else None
597
+ next_decoder_cache = None
598
+
599
+ for decoder_layer in self.layers:
600
+ if output_hidden_states:
601
+ all_hidden_states += (hidden_states,)
602
+
603
+ if self.gradient_checkpointing and self.training:
604
+ layer_outputs = self._gradient_checkpointing_func(
605
+ decoder_layer.__call__,
606
+ hidden_states,
607
+ attention_mask,
608
+ position_ids,
609
+ past_key_values,
610
+ output_attentions,
611
+ use_cache,
612
+ )
613
+ else:
614
+ layer_outputs = decoder_layer(
615
+ hidden_states,
616
+ attention_mask=attention_mask,
617
+ position_ids=position_ids,
618
+ past_key_value=past_key_values,
619
+ output_attentions=output_attentions,
620
+ use_cache=use_cache,
621
+ )
622
+
623
+ hidden_states = layer_outputs[0]
624
+
625
+ if use_cache:
626
+ next_decoder_cache = layer_outputs[2 if output_attentions else 1]
627
+
628
+ if output_attentions:
629
+ all_self_attns += (layer_outputs[1],)
630
+
631
+ batch, seq_len, embed_dim = hidden_states.shape
632
+ for start_idx in range(0, seq_len, 32000):
633
+ end_idx = min(seq_len, start_idx + 32000)
634
+ hidden_states[:, start_idx:end_idx, :] = self.norm(
635
+ hidden_states[:, start_idx:end_idx, :]
636
+ )
637
+
638
+ # add hidden states from the last decoder layer
639
+ if output_hidden_states:
640
+ all_hidden_states += (hidden_states,)
641
+
642
+ next_cache = None
643
+ if use_cache:
644
+ next_cache = (
645
+ next_decoder_cache.to_legacy_cache()
646
+ if use_legacy_cache
647
+ else next_decoder_cache
648
+ )
649
+ if not return_dict:
650
+ return tuple(
651
+ v
652
+ for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
653
+ if v is not None
654
+ )
655
+ return BaseModelOutputWithPast(
656
+ last_hidden_state=hidden_states,
657
+ past_key_values=next_cache,
658
+ hidden_states=all_hidden_states,
659
+ attentions=all_self_attns,
660
+ )
661
+
662
+
663
+ def forward_llama_for_causal_lm(
664
+ self,
665
+ input_ids: torch.LongTensor = None,
666
+ attention_mask: Optional[torch.Tensor] = None,
667
+ position_ids: Optional[torch.LongTensor] = None,
668
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
669
+ inputs_embeds: Optional[torch.FloatTensor] = None,
670
+ labels: Optional[torch.LongTensor] = None,
671
+ use_cache: Optional[bool] = None,
672
+ output_attentions: Optional[bool] = None,
673
+ output_hidden_states: Optional[bool] = None,
674
+ return_dict: Optional[bool] = None,
675
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
676
+ # assert labels is not None
677
+ output_attentions = (
678
+ output_attentions
679
+ if output_attentions is not None
680
+ else self.config.output_attentions
681
+ )
682
+ output_hidden_states = (
683
+ output_hidden_states
684
+ if output_hidden_states is not None
685
+ else self.config.output_hidden_states
686
+ )
687
+ return_dict = (
688
+ return_dict if return_dict is not None else self.config.use_return_dict
689
+ )
690
+
691
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
692
+ outputs = self.model(
693
+ input_ids=input_ids,
694
+ attention_mask=attention_mask,
695
+ position_ids=position_ids,
696
+ past_key_values=past_key_values,
697
+ inputs_embeds=inputs_embeds,
698
+ use_cache=use_cache,
699
+ output_attentions=output_attentions,
700
+ output_hidden_states=output_hidden_states,
701
+ return_dict=return_dict,
702
+ )
703
+ torch.cuda.empty_cache()
704
+
705
+ hidden_states = outputs[0]
706
+ if labels is not None:
707
+ loss_fct = CrossEntropyLoss(reduction="sum")
708
+ valid_seq_len = input_ids.shape[-1] - 1
709
+ valid_seq_len_slide_win = torch.sum(labels[:, 1:] >= 0).item()
710
+ # print("valid_seq_len_slide_win", valid_seq_len)
711
+ loss = 0.0
712
+
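+ # Run the LM head and the cross-entropy in 32k-token chunks so the full-vocabulary logits for a
+ # long sequence never materialize at once; the sum-reduced losses are averaged over valid targets below.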
713
+ for start_idx in range(0, valid_seq_len, 32000):
714
+ end_idx = min(start_idx + 32000, valid_seq_len)
715
+ shift_logits = self.lm_head(
716
+ hidden_states[..., start_idx:end_idx, :]
717
+ ).float()
718
+ shift_labels = labels[..., start_idx + 1 : end_idx + 1].contiguous()
719
+ # Flatten the tokens
720
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
721
+ shift_labels = shift_labels.view(-1)
722
+ # Enable model parallelism
723
+ shift_labels = shift_labels.to(shift_logits.device)
724
+ loss += loss_fct(shift_logits, shift_labels)
725
+
726
+ loss /= valid_seq_len_slide_win
727
+ logits = None
728
+ else:
729
+ if self.config.to_dict().get("is_ppl", False):
730
+ logits = self.lm_head(hidden_states)
731
+ else:
732
+ logits = self.lm_head(hidden_states[:, -1:]).float()
733
+ loss = None
734
+
735
+ return CausalLMOutputWithPast(
736
+ loss=loss,
737
+ logits=logits,
738
+ past_key_values=outputs.past_key_values,
739
+ )
740
+
741
+
742
+ def minference_patch(model, config):
743
+ from transformers import LlamaForCausalLM
744
+
745
+ if config.kv_cache_cpu:
746
+ return minference_patch_kv_cache_cpu(model)
747
+ if config.use_snapkv:
748
+ return minference_patch_with_snapkv(model)
749
+
750
+ Attention = model.model.layers[0].self_attn.__class__
751
+ Model = model.model.__class__
752
+ DecoderLayer = model.model.layers[0].__class__
753
+
754
+ forward = minference_forward()
755
+
756
+ def update_module(m):
757
+ if isinstance(m, Attention):
758
+ m.init_minference_parameters = init_minference_parameters.__get__(
759
+ m, Attention
760
+ )
761
+ m.gather_last_q_vertical_slash_topk_v4 = (
762
+ gather_last_q_vertical_slash_topk_v4.__get__(m, Attention)
763
+ )
764
+ m.forward = forward.__get__(m, Attention)
765
+ if isinstance(m, DecoderLayer):
766
+ m.forward = forward_llama_decoder_layer.__get__(m, DecoderLayer)
767
+
768
+ model.apply(update_module)
769
+ model.prepare_inputs_for_generation = hf_437_prepare_inputs_for_generation.__get__(
770
+ model, model.__class__
771
+ )
772
+ model.model._use_sdpa = False
773
+
774
+ model.model._prepare_decoder_attention_mask = (
775
+ _prepare_decoder_attention_mask_inference.__get__(
776
+ model.model, model.model.__class__
777
+ )
778
+ )
779
+ model.model.forward = forward_llama_model.__get__(
780
+ model.model, model.model.__class__
781
+ )
782
+ model.forward = forward_llama_for_causal_lm.__get__(model, model.__class__)
783
+
784
+ print("Patched model for minference..")
785
+ return model
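+
+ # Sketch of intended use (an assumption drawn from this file, not a documented API):
+ #   model = AutoModelForCausalLM.from_pretrained(name, torch_dtype="auto", device_map="cuda")
+ #   model = minference_patch(model, config)  # config carries the kv_cache_cpu / use_snapkv flags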
786
+
787
+
788
+ def minference_patch_kv_cache_cpu(model):
789
+ from transformers import LlamaForCausalLM
790
+
791
+ transformers.cache_utils.DynamicCache.update = cpu_cache_update
792
+ transformers.cache_utils.DynamicCache.get = cpu_cache_get
793
+
794
+ Attention = model.model.layers[0].self_attn.__class__
795
+ Model = model.model.__class__
796
+ DecoderLayer = model.model.layers[0].__class__
797
+
798
+ forward = minference_kv_cache_cpu_forward()
799
+
800
+ def update_module(m):
801
+ if isinstance(m, Attention):
802
+ m.init_minference_parameters = init_minference_parameters.__get__(
803
+ m, Attention
804
+ )
805
+ m.gather_last_q_vertical_slash_topk_v4 = (
806
+ gather_last_q_vertical_slash_topk_v4.__get__(m, Attention)
807
+ )
808
+ m.forward = forward.__get__(m, Attention)
809
+ if isinstance(m, DecoderLayer):
810
+ m.forward = forward_llama_decoder_layer.__get__(m, DecoderLayer)
811
+
812
+ model.apply(update_module)
813
+ model.prepare_inputs_for_generation = hf_437_prepare_inputs_for_generation.__get__(
814
+ model, model.__class__
815
+ )
816
+ model.model._use_sdpa = False
817
+
818
+ model.model._prepare_decoder_attention_mask = (
819
+ _prepare_decoder_attention_mask_inference.__get__(
820
+ model.model, model.model.__class__
821
+ )
822
+ )
823
+ model.model.forward = forward_llama_model.__get__(
824
+ model.model, model.model.__class__
825
+ )
826
+ model.forward = forward_llama_for_causal_lm.__get__(model, model.__class__)
827
+
828
+ print("Patched model for MInference load KV Cache to CPU.")
829
+ return model
830
+
831
+
832
+ def minference_patch_with_snapkv(model):
833
+ from transformers import LlamaForCausalLM
834
+
835
+ Attention = model.model.layers[0].self_attn.__class__
836
+ Model = model.model.__class__
837
+ DecoderLayer = model.model.layers[0].__class__
838
+
839
+ forward = minference_with_snapkv_forward()
840
+
841
+ def update_module(m):
842
+ if isinstance(m, Attention):
843
+ m.init_minference_parameters = init_minference_parameters.__get__(
844
+ m, Attention
845
+ )
846
+ m.gather_last_q_vertical_slash_topk_v4 = (
847
+ gather_last_q_vertical_slash_topk_v4.__get__(m, Attention)
848
+ )
849
+ m.forward = forward.__get__(m, Attention)
850
+ if isinstance(m, DecoderLayer):
851
+ m.forward = forward_llama_decoder_layer.__get__(m, DecoderLayer)
852
+
853
+ model.apply(update_module)
854
+ model.prepare_inputs_for_generation = prepare_inputs_for_generation_snapkv.__get__(
855
+ model, model.__class__
856
+ )
857
+ model.model._use_sdpa = False
858
+
859
+ model.model._prepare_decoder_attention_mask = (
860
+ _prepare_decoder_attention_mask_inference.__get__(
861
+ model.model, model.model.__class__
862
+ )
863
+ )
864
+ model.model.forward = forward_llama_model.__get__(
865
+ model.model, model.model.__class__
866
+ )
867
+ model.forward = forward_llama_for_causal_lm.__get__(model, model.__class__)
868
+
869
+ print("Patched model for minference with SanpKV..")
870
+ return model
871
+
872
+
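`minference_patch_with_snapkv` pairs the sparse prefill with SnapKV-style KV-cache compression. Schematically, SnapKV scores past key positions with the attention mass from a recent observation window and keeps only the top-k of them; below is a toy sketch of that selection rule, not the actual SnapKV implementation:

import torch

def snapkv_select(attn_weights, k_keep):
    # attn_weights: (heads, obs_window, seq_len) attention from the last
    # obs_window queries; pool over the window, keep top-k key positions.
    scores = attn_weights.sum(dim=1)            # (heads, seq_len)
    return scores.topk(k_keep, dim=-1).indices  # per-head keep set

attn = torch.softmax(torch.randn(8, 32, 4096), dim=-1)
print(snapkv_select(attn, k_keep=512).shape)    # torch.Size([8, 512])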
873
+ def llama_model_forward_vllm(
874
+ self,
875
+ input_ids: Optional[torch.Tensor],
876
+ positions: torch.Tensor,
877
+ kv_caches: List[torch.Tensor],
878
+ attn_metadata,
879
+ inputs_embeds: Optional[torch.Tensor] = None,
880
+ ) -> torch.Tensor:
881
+ if inputs_embeds is not None:
882
+ hidden_states = inputs_embeds
883
+ else:
884
+ hidden_states = self.get_input_embeddings(input_ids)
885
+ residual = None
886
+ for i in range(len(self.layers)):
887
+ layer = self.layers[i]
888
+ hidden_states, residual = layer(
889
+ positions,
890
+ hidden_states,
891
+ kv_caches[i],
892
+ attn_metadata,
893
+ residual,
894
+ layer_idx=i,
895
+ )
896
+ hidden_states, _ = self.norm(hidden_states, residual)
897
+ return hidden_states
898
+
899
+
900
+ def llama_layer_forward_vllm(
901
+ self,
902
+ positions: torch.Tensor,
903
+ hidden_states: torch.Tensor,
904
+ kv_cache: torch.Tensor,
905
+ attn_metadata,
906
+ residual: Optional[torch.Tensor],
907
+ layer_idx: int,
908
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
909
+ # Self Attention
910
+ if residual is None:
911
+ residual = hidden_states
912
+ hidden_states = self.input_layernorm(hidden_states)
913
+ else:
914
+ hidden_states, residual = self.input_layernorm(hidden_states, residual)
915
+ hidden_states = self.self_attn(
916
+ positions=positions,
917
+ hidden_states=hidden_states,
918
+ kv_cache=kv_cache,
919
+ attn_metadata=attn_metadata,
920
+ layer_idx=layer_idx,
921
+ )
922
+
923
+ # Fully Connected
924
+ hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
925
+ hidden_states = self.mlp(hidden_states)
926
+ return hidden_states, residual
927
+
928
+
929
+ def llama_attn_forward_vllm(
930
+ self,
931
+ positions: torch.Tensor,
932
+ hidden_states: torch.Tensor,
933
+ kv_cache: torch.Tensor,
934
+ attn_metadata,
935
+ layer_idx: int,
936
+ ) -> torch.Tensor:
937
+ qkv, _ = self.qkv_proj(hidden_states)
938
+ q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
939
+ q, k = self.rotary_emb(positions, q, k)
940
+ attn_output = self.attn(q, k, v, kv_cache, attn_metadata, self.kv_scale, layer_idx)
941
+ output, _ = self.o_proj(attn_output)
942
+ return output
943
+
944
+
945
+ def vllm_attn_forward(
946
+ self,
947
+ query: torch.Tensor,
948
+ key: torch.Tensor,
949
+ value: torch.Tensor,
950
+ kv_cache: Optional[torch.Tensor],
951
+ attn_metadata,
952
+ kv_scale: float = 1.0,
953
+ layer_idx: int = 0,
954
+ ) -> torch.Tensor:
955
+ return self.impl.forward(
956
+ query, key, value, kv_cache, attn_metadata, kv_scale, layer_idx
957
+ )
958
+
959
+
960
+ def minference_patch_vllm(
961
+ llm,
962
+ config_file,
963
+ ):
964
+ from vllm.attention import Attention
965
+ from vllm.model_executor.models.llama import (
966
+ LlamaAttention,
967
+ LlamaDecoderLayer,
968
+ LlamaForCausalLM,
969
+ LlamaModel,
970
+ )
971
+
972
+ config = json.load(open(config_file))
973
+ attn_forward = minference_vllm_forward(config)
974
+
975
+ def update_module(m):
976
+ if isinstance(m, Attention):
977
+ m.forward = vllm_attn_forward.__get__(m, Attention)
978
+
979
+ m = m.impl
980
+ m_cls = m.__class__
981
+ m.gather_last_q_vertical_slash_topk_vllm = (
982
+ gather_last_q_vertical_slash_topk_vllm.__get__(m, m_cls)
983
+ )
984
+ m.forward = attn_forward.__get__(m, m_cls)
985
+ if isinstance(m, LlamaDecoderLayer):
986
+ m.forward = llama_layer_forward_vllm.__get__(m, LlamaDecoderLayer)
987
+ if isinstance(m, LlamaModel):
988
+ m.forward = llama_model_forward_vllm.__get__(m, LlamaModel)
989
+ if isinstance(m, LlamaAttention):
990
+ m.forward = llama_attn_forward_vllm.__get__(m, LlamaAttention)
991
+
992
+ llm.llm_engine.model_executor.driver_worker.model_runner.model.apply(update_module)
993
+
994
+ print("Patched model for minference with VLLM..")
995
+ return llm
996
+
997
+
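A hedged sketch of driving `minference_patch_vllm`; the checkpoint and JSON path are placeholders, and the file is assumed to hold the per-head sparse-pattern config that `minference_vllm_forward` loads. `enforce_eager=True` is an assumption here, since CUDA-graph capture may not play well with monkey-patched forwards:

from vllm import LLM, SamplingParams

llm = LLM(model="gradientai/Llama-3-8B-Instruct-262k", enforce_eager=True)
llm = minference_patch_vllm(llm, config_file="path/to/best_pattern.json")
outputs = llm.generate(["Summarize: ..."], SamplingParams(max_tokens=64))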
998
+ def patch_hf(
999
+ model,
1000
+ attn_type: str = "inf_llm",
1001
+ attn_kwargs: Optional[dict] = None,
1002
+ base=None,
1003
+ distance_scale=None,
1004
+ **kwargs,
1005
+ ):
1006
+ attn_kwargs = {**(attn_kwargs or {}), **kwargs}  # avoid mutating a shared default dict
1007
+ # This approach lacks scalability and will be refactored.
1008
+ from transformers import LlamaForCausalLM, MistralForCausalLM, Qwen2ForCausalLM
1009
+ from transformers.models.llama.modeling_llama import (
1010
+ BaseModelOutputWithPast,
1011
+ LlamaAttention,
1012
+ LlamaModel,
1013
+ )
1014
+ from transformers.models.mistral.modeling_mistral import (
1015
+ MistralAttention,
1016
+ MistralModel,
1017
+ )
1018
+ from transformers.models.qwen2.modeling_qwen2 import Qwen2Attention, Qwen2Model
1019
+
1020
+ def model_forward(
1021
+ self,
1022
+ input_ids: torch.LongTensor = None,
1023
+ attention_mask=None,
1024
+ position_ids=None,
1025
+ past_key_values=None,
1026
+ inputs_embeds=None,
1027
+ use_cache=None,
1028
+ output_attentions=None,
1029
+ output_hidden_states=None,
1030
+ return_dict=None,
1031
+ *args,
1032
+ **kwargs,
1033
+ ):
1034
+ output_attentions = (
1035
+ output_attentions
1036
+ if output_attentions is not None
1037
+ else self.config.output_attentions
1038
+ )
1039
+ output_hidden_states = (
1040
+ output_hidden_states
1041
+ if output_hidden_states is not None
1042
+ else self.config.output_hidden_states
1043
+ )
1044
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
1045
+
1046
+ return_dict = (
1047
+ return_dict if return_dict is not None else self.config.use_return_dict
1048
+ )
1049
+
1050
+ # retrieve input_ids and inputs_embeds
1051
+ if input_ids is not None and inputs_embeds is not None:
1052
+ raise ValueError(
1053
+ "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
1054
+ )
1055
+ elif input_ids is not None:
1056
+ batch_size, seq_length = input_ids.shape
1057
+ elif inputs_embeds is not None:
1058
+ batch_size, seq_length, _ = inputs_embeds.shape
1059
+ else:
1060
+ raise ValueError(
1061
+ "You have to specify either decoder_input_ids or decoder_inputs_embeds"
1062
+ )
1063
+
1064
+ if inputs_embeds is None:
1065
+ inputs_embeds = self.embed_tokens(input_ids)
1066
+ if hasattr(self, "config") and hasattr(self.config, "scale_emb"):
1067
+ inputs_embeds = inputs_embeds * self.config.scale_emb
1068
+
1069
+ if use_cache:
1070
+ pkv = tuple()
1071
+
1072
+ else:
1073
+ pkv = None
1074
+
1075
+ hidden_states = inputs_embeds
1076
+
1077
+ # decoder layers
1078
+ all_hidden_states = () if output_hidden_states else None
1079
+ all_self_attns = () if output_attentions else None
1080
+
1081
+ for i, decoder_layer in enumerate(self.layers):
1082
+ if output_hidden_states:
1083
+ all_hidden_states += (hidden_states,)
1084
+
1085
+ layer_outputs = decoder_layer(
1086
+ hidden_states,
1087
+ attention_mask=attention_mask,
1088
+ position_ids=self.position_bias,
1089
+ past_key_value=(
1090
+ past_key_values[i] if past_key_values is not None else None
1091
+ ),
1092
+ output_attentions=output_attentions,
1093
+ use_cache=use_cache,
1094
+ )
1095
+
1096
+ hidden_states = layer_outputs[0]
1097
+
1098
+ if use_cache:
1099
+ _cache = layer_outputs[2 if output_attentions else 1]
1100
+ pkv = pkv + (_cache,)
1101
+
1102
+ if output_attentions:
1103
+ all_self_attns += (layer_outputs[1],)
1104
+
1105
+ # Equivalent to hidden_states = self.norm(hidden_states), but applied in 32k-token chunks to bound peak memory:
1106
+ for start_idx in range(0, hidden_states.size(1), 32000):
1107
+ end_idx = min(hidden_states.size(1), start_idx + 32000)
1108
+ hidden_states[:, start_idx:end_idx, :] = self.norm(
1109
+ hidden_states[:, start_idx:end_idx, :]
1110
+ )
1111
+
1112
+ # add hidden states from the last decoder layer
1113
+ if output_hidden_states:
1114
+ all_hidden_states += (hidden_states,)
1115
+
1116
+ if not return_dict:
1117
+ return tuple(
1118
+ v
1119
+ for v in [hidden_states, pkv, all_hidden_states, all_self_attns]
1120
+ if v is not None
1121
+ )
1122
+ return BaseModelOutputWithPast(
1123
+ last_hidden_state=hidden_states,
1124
+ past_key_values=pkv,
1125
+ hidden_states=all_hidden_states,
1126
+ attentions=all_self_attns,
1127
+ )
1128
+
1129
+ forward = huggingface_forward(ATTN_FORWRAD[attn_type](**attn_kwargs))
1130
+
1131
+ if isinstance(model, LlamaForCausalLM):
1132
+ Attention = model.model.layers[0].self_attn.__class__
1133
+ Model = model.model.__class__
1134
+ elif isinstance(model, MistralForCausalLM):
1135
+ Attention = model.model.layers[0].self_attn.__class__
1136
+ Model = model.model.__class__
1137
+ elif isinstance(model, Qwen2ForCausalLM):
1138
+ Attention = model.model.layers[0].self_attn.__class__
1139
+ Model = model.model.__class__
1140
+ elif model.__class__.__name__ == "MiniCPMForCausalLM":
1141
+ Attention = model.model.layers[0].self_attn.__class__
1142
+ Model = model.model.__class__
1143
+ elif model.__class__.__name__ == "Phi3ForCausalLM":
1144
+ Attention = model.model.layers[0].self_attn.__class__
1145
+ Model = model.model.__class__
1146
+ else:
1147
+ raise ValueError("Only supports llama, mistral and qwen2 models.")
1148
+
1149
+ hf_rope = model.model.layers[0].self_attn.rotary_emb
1150
+ base = base if base is not None else hf_rope.base
1151
+ distance_scale = distance_scale if distance_scale is not None else 1.0
1152
+ rope = RotaryEmbeddingESM(hf_rope.dim, base, distance_scale)
1153
+ model.model.position_bias = rope
1154
+ model.model.hf_position_bias = hf_rope
1155
+
1156
+ def set_forward(m):
1157
+ if isinstance(m, Attention):
1158
+ m._old_forward = m.forward
1159
+ m.forward = forward.__get__(m, Attention)
1160
+
1161
+ model.apply(set_forward)
1162
+
1163
+ model._old_prepare_inputs_for_generation = model.prepare_inputs_for_generation
1164
+ model.prepare_inputs_for_generation = prepare_inputs_for_generation.__get__(
1165
+ model, model.__class__
1166
+ )
1167
+ model.model._old_forward = model.model.forward
1168
+ model.model.forward = model_forward.__get__(model.model, Model)
1169
+
1170
+ if attn_type == "inf_llm":
1171
+ tokenizer = transformers.AutoTokenizer.from_pretrained(
1172
+ model.config._name_or_path
1173
+ )
1174
+ model = InfLLMGenerator(model, tokenizer)
1175
+
1176
+ print("Patched model ...")
1177
+ return model
1178
+
1179
+
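And a sketch of calling `patch_hf` with the default `inf_llm` attention type; extra keyword arguments are merged into `attn_kwargs` and forwarded to `ATTN_FORWRAD[attn_type]`, whose accepted options are defined elsewhere in this package:

import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16
).cuda()
gen = patch_hf(model, attn_type="inf_llm")  # returns an InfLLMGenerator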
1180
+ def fp8_cache_update(
1181
+ self,
1182
+ key_states: torch.Tensor,
1183
+ value_states: torch.Tensor,
1184
+ layer_idx: int,
1185
+ cache_kwargs: Optional[Dict[str, Any]] = None,
1186
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
1187
+ """
1188
+ Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
1189
+
1190
+ Parameters:
1191
+ key_states (`torch.Tensor`):
1192
+ The new key states to cache.
1193
+ value_states (`torch.Tensor`):
1194
+ The new value states to cache.
1195
+ layer_idx (`int`):
1196
+ The index of the layer to cache the states for.
1197
+ cache_kwargs (`Dict[str, Any]`, `optional`):
1198
+ Additional arguments for the cache subclass. No additional arguments are used in `DynamicCache`.
1199
+
1200
+ Return:
1201
+ A tuple containing the updated key and value states.
1202
+ """
1203
+ # Update the number of seen tokens
1204
+ if layer_idx == 0:
1205
+ self.seen_tokens += key_states.shape[-2]
1206
+
1207
+ # Update the cache
1208
+ if len(self.key_cache) <= layer_idx:
1209
+ self.key_cache.append(key_states.to(torch.float8_e5m2))
1210
+ self.value_cache.append(value_states.to(torch.float8_e5m2))
1211
+ else:
1212
+ self.key_cache[layer_idx] = torch.cat(
1213
+ [self.key_cache[layer_idx], key_states.to(torch.float8_e5m2)], dim=-2
1214
+ )
1215
+ self.value_cache[layer_idx] = torch.cat(
1216
+ [self.value_cache[layer_idx], value_states.to(torch.float8_e5m2)], dim=-2
1217
+ )
1218
+
1219
+ return self.key_cache[layer_idx].to(key_states.dtype), self.value_cache[
1220
+ layer_idx
1221
+ ].to(key_states.dtype)
1222
+
1223
+
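`fp8_cache_update` trades precision for memory: keys and values are stored as `torch.float8_e5m2` (1 byte per element versus 2 for fp16/bf16) and cast back on read. A quick round-trip sketch of the quantization error this accepts (the float8 dtypes require PyTorch >= 2.1):

import torch

kv = torch.randn(2, 8, 128, 64, dtype=torch.float16)  # (batch, heads, seq, dim)
kv_fp8 = kv.to(torch.float8_e5m2)                      # half the bytes of fp16
roundtrip = kv_fp8.to(torch.float16)
print((kv - roundtrip).abs().max())                    # small but nonzero error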
1224
+ def cpu_cache_update(
1225
+ self,
1226
+ key_states: torch.Tensor,
1227
+ value_states: torch.Tensor,
1228
+ layer_idx: int,
1229
+ cache_kwargs: Optional[Dict[str, Any]] = None,
1230
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
1231
+ if layer_idx == 0:
1232
+ if "_seen_tokens" in self.__dict__:
1233
+ self._seen_tokens += key_states.shape[-2]
1234
+ else:
1235
+ self.seen_tokens += key_states.shape[-2]
1236
+
1237
+ # Update the cache
1238
+ if len(self.key_cache) <= layer_idx:
1239
+ self.key_cache.append(key_states.cpu())
1240
+ self.value_cache.append(value_states.cpu())
1241
+ else:
1242
+ self.key_cache[layer_idx] = torch.cat(
1243
+ [self.key_cache[layer_idx], key_states.cpu()], dim=-2
1244
+ )
1245
+ self.value_cache[layer_idx] = torch.cat(
1246
+ [self.value_cache[layer_idx], value_states.cpu()], dim=-2
1247
+ )
1248
+
1249
+
1250
+ def cpu_cache_get(
1251
+ self,
1252
+ key_states: torch.Tensor,
1253
+ value_states: torch.Tensor,
1254
+ layer_idx: int,
1255
+ head_idx: int,
1256
+ cache_kwargs: Optional[Dict[str, Any]] = None,
1257
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
1258
+ if layer_idx == 0:
1259
+ if "_seen_tokens" in self.__dict__:
1260
+ self._seen_tokens += key_states.shape[-2]
1261
+ else:
1262
+ self.seen_tokens += key_states.shape[-2]
1263
+
1264
+ # Update the cache
1265
+ if len(self.key_cache) <= layer_idx:
1266
+ return key_states, value_states
1267
+ else:
1268
+ key_states = torch.cat(
1269
+ [self.key_cache[layer_idx][:, head_idx : head_idx + 1].cuda(), key_states],
1270
+ dim=-2,
1271
+ )
1272
+ value_states = torch.cat(
1273
+ [
1274
+ self.value_cache[layer_idx][:, head_idx : head_idx + 1].cuda(),
1275
+ value_states,
1276
+ ],
1277
+ dim=-2,
1278
+ )
1279
+ return key_states, value_states
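Together, `cpu_cache_update` and `cpu_cache_get` implement KV-cache offloading: the full cache stays in host memory, and only one head's slice is copied back to the GPU per call, so peak device memory scales with a single head rather than the whole cache. A rough sketch of the transfer sizes involved (shapes are illustrative):

import torch

# One layer's key cache on the CPU: (batch, heads, seq_len, head_dim)
key_cache_cpu = torch.randn(1, 32, 32_768, 128, dtype=torch.float16)

# Per head, a (1, 1, seq_len, head_dim) slice moves to the GPU:
# ~8 MB here, versus ~268 MB for the full layer cache.
if torch.cuda.is_available():
    k_head = key_cache_cpu[:, 0:1].cuda()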
minference/version.py ADDED
@@ -0,0 +1,14 @@
1
+ # Copyright (c) 2024 Microsoft
2
+ # Licensed under The MIT License [see LICENSE for details]
3
+
4
+ _MAJOR = "0"
5
+ _MINOR = "1"
6
+ # On master and in a nightly release the patch should be one ahead of the last
7
+ # released build.
8
+ _PATCH = "0"
9
+ # This is mainly for nightly builds which have the suffix ".dev$DATE". See
10
+ # https://semver.org/#is-v123-a-semantic-version for the semantics.
11
+ _SUFFIX = "alpha.1"
12
+
13
+ VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR)
14
+ VERSION = "{0}.{1}.{2}{3}".format(_MAJOR, _MINOR, _PATCH, _SUFFIX)
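With the values above, the two strings evaluate as follows; note the suffix is concatenated with no separator:

print(VERSION_SHORT)  # 0.1
print(VERSION)        # 0.1.0alpha.1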
requirements.txt ADDED
@@ -0,0 +1,5 @@
1
+ flash_attn
2
+ triton==2.1.0
3
+ pycuda==2023.1
4
+ accelerate
5
+ transformers