Spaces:
Running on Zero
update [kernels:flash-attn2] ✅
Browse files
app.py
CHANGED
|
@@ -230,7 +230,7 @@ MODEL_ID_V = "nanonets/Nanonets-OCR2-3B"
|
|
| 230 |
processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
|
| 231 |
model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
| 232 |
MODEL_ID_V,
|
| 233 |
-
attn_implementation="kernels-community/flash-
|
| 234 |
trust_remote_code=True,
|
| 235 |
torch_dtype=torch.float16
|
| 236 |
).to(device).eval()
|
|
@@ -239,7 +239,7 @@ MODEL_ID_X = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
|
|
| 239 |
processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
|
| 240 |
model_x = Qwen2VLForConditionalGeneration.from_pretrained(
|
| 241 |
MODEL_ID_X,
|
| 242 |
-
attn_implementation="kernels-community/flash-
|
| 243 |
trust_remote_code=True,
|
| 244 |
torch_dtype=torch.float16
|
| 245 |
).to(device).eval()
|
|
@@ -248,7 +248,7 @@ MODEL_ID_A = "CohereForAI/aya-vision-8b"
|
|
| 248 |
processor_a = AutoProcessor.from_pretrained(MODEL_ID_A, trust_remote_code=True)
|
| 249 |
model_a = AutoModelForImageTextToText.from_pretrained(
|
| 250 |
MODEL_ID_A,
|
| 251 |
-
attn_implementation="kernels-community/flash-
|
| 252 |
trust_remote_code=True,
|
| 253 |
torch_dtype=torch.float16
|
| 254 |
).to(device).eval()
|
|
@@ -257,7 +257,7 @@ MODEL_ID_W = "allenai/olmOCR-7B-0725"
|
|
| 257 |
processor_w = AutoProcessor.from_pretrained(MODEL_ID_W, trust_remote_code=True)
|
| 258 |
model_w = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
| 259 |
MODEL_ID_W,
|
| 260 |
-
attn_implementation="kernels-community/flash-
|
| 261 |
trust_remote_code=True,
|
| 262 |
torch_dtype=torch.float16
|
| 263 |
).to(device).eval()
|
|
@@ -399,7 +399,7 @@ with gr.Blocks() as demo:
|
|
| 399 |
)
|
| 400 |
gpu_duration_state = gr.Number(value=60, visible=False)
|
| 401 |
|
| 402 |
-
|
| 403 |
|
| 404 |
radioanimated_gpu_duration.change(
|
| 405 |
fn=apply_gpu_duration,
|
|
|
|
| 230 |
processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
|
| 231 |
model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
| 232 |
MODEL_ID_V,
|
| 233 |
+
attn_implementation="kernels-community/flash-attn2",
|
| 234 |
trust_remote_code=True,
|
| 235 |
torch_dtype=torch.float16
|
| 236 |
).to(device).eval()
|
|
|
|
| 239 |
processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
|
| 240 |
model_x = Qwen2VLForConditionalGeneration.from_pretrained(
|
| 241 |
MODEL_ID_X,
|
| 242 |
+
attn_implementation="kernels-community/flash-attn2",
|
| 243 |
trust_remote_code=True,
|
| 244 |
torch_dtype=torch.float16
|
| 245 |
).to(device).eval()
|
|
|
|
| 248 |
processor_a = AutoProcessor.from_pretrained(MODEL_ID_A, trust_remote_code=True)
|
| 249 |
model_a = AutoModelForImageTextToText.from_pretrained(
|
| 250 |
MODEL_ID_A,
|
| 251 |
+
attn_implementation="kernels-community/flash-attn2",
|
| 252 |
trust_remote_code=True,
|
| 253 |
torch_dtype=torch.float16
|
| 254 |
).to(device).eval()
|
|
|
|
| 257 |
processor_w = AutoProcessor.from_pretrained(MODEL_ID_W, trust_remote_code=True)
|
| 258 |
model_w = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
| 259 |
MODEL_ID_W,
|
| 260 |
+
attn_implementation="kernels-community/flash-attn2",
|
| 261 |
trust_remote_code=True,
|
| 262 |
torch_dtype=torch.float16
|
| 263 |
).to(device).eval()
|
|
|
|
| 399 |
)
|
| 400 |
gpu_duration_state = gr.Number(value=60, visible=False)
|
| 401 |
|
| 402 |
+
# gr.Markdown("*Note: Higher GPU duration allows for longer processing but consumes more GPU quota.*")
|
| 403 |
|
| 404 |
radioanimated_gpu_duration.change(
|
| 405 |
fn=apply_gpu_duration,
|