@7shi
Last active October 28, 2025 09:16
DeepSeek OCR CPU implementation - Script and patch to run DeepSeek-OCR on CPU without CUDA
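The driver script below parses a single image path from the command line, loads deepseek-ai/DeepSeek-OCR with device_map='cpu' and float32 weights, and runs inference with the Tiny preset (base_size=512, image_size=512, crop_mode=False).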
import argparse
import os
# Parse command line arguments
parser = argparse.ArgumentParser(description='DeepSeek OCR - Convert images to markdown')
parser.add_argument('input', type=str, help='Input image file path')
parser.add_argument('-o', '--output', type=str, default=None, help='Output directory path (default: input filename without extension)')
args = parser.parse_args()
# If output is not specified, use input filename without extension
if args.output is None:
    args.output = os.path.splitext(args.input)[0]
from transformers import AutoModel, AutoTokenizer
import torch
#os.environ["CUDA_VISIBLE_DEVICES"] = '0'
model_name = 'deepseek-ai/DeepSeek-OCR'
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True
)
model = AutoModel.from_pretrained(
    model_name,
    #_attn_implementation='flash_attention_2',
    trust_remote_code=True,
    use_safetensors=True,
    device_map='cpu',
    torch_dtype=torch.float32
)
#model = model.eval().cuda().to(torch.bfloat16)
model = model.eval()
# prompt = "<image>\nFree OCR. "
prompt = "<image>\n<|grounding|>Convert the document to markdown. "
image_file = args.input
output_path = args.output
# infer(self, tokenizer, prompt='', image_file='', output_path='', base_size=1024, image_size=640, crop_mode=True, test_compress=False, save_results=False, eval_mode=False):
# Tiny: base_size = 512, image_size = 512, crop_mode = False
# Small: base_size = 640, image_size = 640, crop_mode = False
# Base: base_size = 1024, image_size = 1024, crop_mode = False
# Large: base_size = 1280, image_size = 1280, crop_mode = False
# Gundam: base_size = 1024, image_size = 640, crop_mode = True
res = model.infer(
    tokenizer,
    prompt=prompt,
    image_file=image_file,
    output_path=output_path,
    base_size=512,
    image_size=512,
    crop_mode=False,
    save_results=True,
    test_compress=True)
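A minimal invocation, assuming the script is saved as ocr.py (the filename is an assumption; the output directory defaults to the input filename without extension):

python ocr.py sample.png
python ocr.py sample.png -o results

The patch below removes the hardcoded .cuda() calls and bfloat16 casts from the model's modeling_deepseekocr.py so the same inference path runs on CPU.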
--- modeling_deepseekocr.py.orig 2025-10-28 17:58:51.152672675 +0900
+++ modeling_deepseekocr.py 2025-10-28 15:09:16.841199336 +0900
@@ -502,7 +502,7 @@
images_in_this_batch = torch.cat(images_in_this_batch, dim=0)
# exit()
- inputs_embeds[idx].masked_scatter_(images_seq_mask[idx].unsqueeze(-1).cuda(), images_in_this_batch)
+ inputs_embeds[idx].masked_scatter_(images_seq_mask[idx].unsqueeze(-1).to(inputs_embeds.device), images_in_this_batch)
idx += 1
@@ -703,6 +703,10 @@
def infer(self, tokenizer, prompt='', image_file='', output_path = '', base_size=1024, image_size=640, crop_mode=True, test_compress=False, save_results=False, eval_mode=False):
self.disable_torch_init()
+ # Detect device and dtype
+ device = next(self.parameters()).device
+ dtype = torch.bfloat16 if device.type == 'cuda' else torch.float32
+
os.makedirs(output_path, exist_ok=True)
os.makedirs(f'{output_path}/images', exist_ok=True)
@@ -798,8 +802,8 @@
-
- images_list.append(image_transform(global_view).to(torch.bfloat16))
+
+ images_list.append(image_transform(global_view).to(dtype))
# global_view_tensor = image_transform(global_view).to(torch.bfloat16)
@@ -810,9 +814,9 @@
if width_crop_num > 1 or height_crop_num > 1:
"""process the local views"""
-
+
for i in range(len(images_crop_raw)):
- images_crop_list.append(image_transform(images_crop_raw[i]).to(torch.bfloat16))
+ images_crop_list.append(image_transform(images_crop_raw[i]).to(dtype))
if image_size == 640:
valid_img_tokens += len(images_crop_list) * 100
@@ -846,7 +850,7 @@
# else:
global_view = ImageOps.pad(image, (image_size, image_size),
color=tuple(int(x * 255) for x in image_transform.mean))
- images_list.append(image_transform(global_view).to(torch.bfloat16))
+ images_list.append(image_transform(global_view).to(dtype))
if base_size == 1024:
valid_img_tokens += int(256 * ratio)
@@ -911,12 +915,12 @@
if not eval_mode:
streamer = NoEOSTextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)
- with torch.autocast("cuda", dtype=torch.bfloat16):
+ with torch.autocast(device.type, dtype=dtype):
with torch.no_grad():
output_ids = self.generate(
- input_ids.unsqueeze(0).cuda(),
- images=[(images_crop.cuda(), images_ori.cuda())],
- images_seq_mask = images_seq_mask.unsqueeze(0).cuda(),
+ input_ids.unsqueeze(0).to(device),
+ images=[(images_crop.to(device), images_ori.to(device))],
+ images_seq_mask = images_seq_mask.unsqueeze(0).to(device),
images_spatial_crop = images_spatial_crop,
# do_sample=False,
# num_beams = 1,
@@ -929,12 +933,12 @@
)
else:
- with torch.autocast("cuda", dtype=torch.bfloat16):
+ with torch.autocast(device.type, dtype=dtype):
with torch.no_grad():
output_ids = self.generate(
- input_ids.unsqueeze(0).cuda(),
- images=[(images_crop.cuda(), images_ori.cuda())],
- images_seq_mask = images_seq_mask.unsqueeze(0).cuda(),
+ input_ids.unsqueeze(0).to(device),
+ images=[(images_crop.to(device), images_ori.to(device))],
+ images_seq_mask = images_seq_mask.unsqueeze(0).to(device),
images_spatial_crop = images_spatial_crop,
# do_sample=False,
# num_beams = 1,
@@ -947,7 +951,7 @@
if '<image>' in conversation[0]['content'] and eval_mode:
- outputs = tokenizer.decode(output_ids[0, input_ids.unsqueeze(0).cuda().shape[1]:])
+ outputs = tokenizer.decode(output_ids[0, input_ids.unsqueeze(0).to(device).shape[1]:])
stop_str = '<|end▁of▁sentence|>'
if outputs.endswith(stop_str):
outputs = outputs[:-len(stop_str)]
@@ -957,7 +961,7 @@
return outputs
if '<image>' in conversation[0]['content'] and test_compress:
- outputs = tokenizer.decode(output_ids[0, input_ids.unsqueeze(0).cuda().shape[1]:])
+ outputs = tokenizer.decode(output_ids[0, input_ids.unsqueeze(0).to(device).shape[1]:])
pure_texts_outputs_token_length = len(text_encode(tokenizer, outputs, bos=False, eos=False))
print('='*50)
print('image size: ', (w, h))
@@ -968,7 +972,7 @@
if '<image>' in conversation[0]['content'] and save_results:
- outputs = tokenizer.decode(output_ids[0, input_ids.unsqueeze(0).cuda().shape[1]:])
+ outputs = tokenizer.decode(output_ids[0, input_ids.unsqueeze(0).to(device).shape[1]:])
stop_str = '<|end▁of▁sentence|>'
print('='*15 + 'save results:' + '='*15)
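The core idea of the patch is to derive the device and dtype from the model itself rather than hardcoding CUDA. A minimal standalone sketch of the same pattern (the helper name is illustrative, not part of the model):

import torch

def detect_device_and_dtype(model: torch.nn.Module):
    # Follow the model: use whatever device its parameters already live on.
    device = next(model.parameters()).device
    # Keep bfloat16 on CUDA; fall back to float32 on CPU, where bfloat16
    # kernels are often slower or unsupported.
    dtype = torch.bfloat16 if device.type == 'cuda' else torch.float32
    return device, dtype

Tensors are then moved with .to(device) instead of .cuda(), and torch.autocast receives device.type instead of the literal "cuda". The pinned dependencies used for this setup follow in pyproject.toml.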
[project]
name = "deepseek-ocr-test"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
"accelerate>=1.11.0",
"addict>=2.4.0",
"easydict>=1.13",
"einops>=0.8.1",
"hf-xet>=1.2.0",
"numpy<2",
"tokenizers==0.20.3",
"torch>=2.9.0",
"torchvision>=0.24.0",
"transformers==4.46.3",
]
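One way to reproduce the environment, assuming uv (suggested by the template-style pyproject; pip in a virtualenv works as well, with the hypothetical ocr.py filename from above):

uv sync
uv run python ocr.py sample.png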