Spaces:

prithivMLmods
/

Multimodal-OCR

Running on Zero

App Files Files Community

prithivMLmods committed on Feb 13

Commit

01dd8c8

verified ·

1 Parent(s): 4377887

update [kernels:flash-attn2] ✅

Browse files

Files changed (1) hide show

app.py +5 -5

app.py CHANGED Viewed

@@ -230,7 +230,7 @@ MODEL_ID_V = "nanonets/Nanonets-OCR2-3B"
 processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
 model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_V,
-    attn_implementation="kernels-community/flash-attn3",
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
@@ -239,7 +239,7 @@ MODEL_ID_X = "prithivMLmods/Qwen2-VL-OCR-2B-Instruct"
 processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
 model_x = Qwen2VLForConditionalGeneration.from_pretrained(
     MODEL_ID_X,
-    attn_implementation="kernels-community/flash-attn3",
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
@@ -248,7 +248,7 @@ MODEL_ID_A = "CohereForAI/aya-vision-8b"
 processor_a = AutoProcessor.from_pretrained(MODEL_ID_A, trust_remote_code=True)
 model_a = AutoModelForImageTextToText.from_pretrained(
     MODEL_ID_A,
-    attn_implementation="kernels-community/flash-attn3",
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
@@ -257,7 +257,7 @@ MODEL_ID_W = "allenai/olmOCR-7B-0725"
 processor_w = AutoProcessor.from_pretrained(MODEL_ID_W, trust_remote_code=True)
 model_w = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_W,
-    attn_implementation="kernels-community/flash-attn3",
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
@@ -399,7 +399,7 @@ with gr.Blocks() as demo:
                     )
                     gpu_duration_state = gr.Number(value=60, visible=False)
-            gr.Markdown("*Note: Higher GPU duration allows for longer processing but consumes more GPU quota.*")
     radioanimated_gpu_duration.change(
         fn=apply_gpu_duration,

 processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
 model_v = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_V,
+    attn_implementation="kernels-community/flash-attn2",
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
 processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
 model_x = Qwen2VLForConditionalGeneration.from_pretrained(
     MODEL_ID_X,
+    attn_implementation="kernels-community/flash-attn2",
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
 processor_a = AutoProcessor.from_pretrained(MODEL_ID_A, trust_remote_code=True)
 model_a = AutoModelForImageTextToText.from_pretrained(
     MODEL_ID_A,
+    attn_implementation="kernels-community/flash-attn2",
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
 processor_w = AutoProcessor.from_pretrained(MODEL_ID_W, trust_remote_code=True)
 model_w = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     MODEL_ID_W,
+    attn_implementation="kernels-community/flash-attn2",
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
                     )
                     gpu_duration_state = gr.Number(value=60, visible=False)
+           # gr.Markdown("*Note: Higher GPU duration allows for longer processing but consumes more GPU quota.*")
     radioanimated_gpu_duration.change(
         fn=apply_gpu_duration,