prithivMLmods committed on
Commit
01dd8c8
·
verified ·
1 Parent(s): 4377887

update [kernels:flash-attn2] ✅

Browse files
Files changed (1) hide show
  1. app.py +5 -5
app.py CHANGED
@@ -230,7 +230,7 @@ MODEL_ID_V = "nanonets/Nanonets-OCR2-3B"
230
  processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
231
  model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
232
  MODEL_ID_V,
233
- attn_implementation="kernels-community/flash-attn3",
234
  trust_remote_code=True,
235
  torch_dtype=torch.float16
236
  ).to(device).eval()
@@ -239,7 +239,7 @@ MODEL_ID_X = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
239
  processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
240
  model_x = Qwen2VLForConditionalGeneration.from_pretrained(
241
  MODEL_ID_X,
242
- attn_implementation="kernels-community/flash-attn3",
243
  trust_remote_code=True,
244
  torch_dtype=torch.float16
245
  ).to(device).eval()
@@ -248,7 +248,7 @@ MODEL_ID_A = "CohereForAI/aya-vision-8b"
248
  processor_a = AutoProcessor.from_pretrained(MODEL_ID_A, trust_remote_code=True)
249
  model_a = AutoModelForImageTextToText.from_pretrained(
250
  MODEL_ID_A,
251
- attn_implementation="kernels-community/flash-attn3",
252
  trust_remote_code=True,
253
  torch_dtype=torch.float16
254
  ).to(device).eval()
@@ -257,7 +257,7 @@ MODEL_ID_W = "allenai/olmOCR-7B-0725"
257
  processor_w = AutoProcessor.from_pretrained(MODEL_ID_W, trust_remote_code=True)
258
  model_w = Qwen2_5_VLForConditionalGeneration.from_pretrained(
259
  MODEL_ID_W,
260
- attn_implementation="kernels-community/flash-attn3",
261
  trust_remote_code=True,
262
  torch_dtype=torch.float16
263
  ).to(device).eval()
@@ -399,7 +399,7 @@ with gr.Blocks() as demo:
399
  )
400
  gpu_duration_state = gr.Number(value=60, visible=False)
401
 
402
- gr.Markdown("*Note: Higher GPU duration allows for longer processing but consumes more GPU quota.*")
403
 
404
  radioanimated_gpu_duration.change(
405
  fn=apply_gpu_duration,
 
230
  processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
231
  model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
232
  MODEL_ID_V,
233
+ attn_implementation="kernels-community/flash-attn2",
234
  trust_remote_code=True,
235
  torch_dtype=torch.float16
236
  ).to(device).eval()
 
239
  processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
240
  model_x = Qwen2VLForConditionalGeneration.from_pretrained(
241
  MODEL_ID_X,
242
+ attn_implementation="kernels-community/flash-attn2",
243
  trust_remote_code=True,
244
  torch_dtype=torch.float16
245
  ).to(device).eval()
 
248
  processor_a = AutoProcessor.from_pretrained(MODEL_ID_A, trust_remote_code=True)
249
  model_a = AutoModelForImageTextToText.from_pretrained(
250
  MODEL_ID_A,
251
+ attn_implementation="kernels-community/flash-attn2",
252
  trust_remote_code=True,
253
  torch_dtype=torch.float16
254
  ).to(device).eval()
 
257
  processor_w = AutoProcessor.from_pretrained(MODEL_ID_W, trust_remote_code=True)
258
  model_w = Qwen2_5_VLForConditionalGeneration.from_pretrained(
259
  MODEL_ID_W,
260
+ attn_implementation="kernels-community/flash-attn2",
261
  trust_remote_code=True,
262
  torch_dtype=torch.float16
263
  ).to(device).eval()
 
399
  )
400
  gpu_duration_state = gr.Number(value=60, visible=False)
401
 
402
+ # gr.Markdown("*Note: Higher GPU duration allows for longer processing but consumes more GPU quota.*")
403
 
404
  radioanimated_gpu_duration.change(
405
  fn=apply_gpu_duration,