DeepSeek OCR CPU implementation - Script and patch to run DeepSeek-OCR on CPU without CUDA
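Usage, as a hedged sketch: the gist does not name the script file, so `deepseek_ocr.py` below is a placeholder. The script takes an input image path and an optional output directory.

import subprocess

# Placeholder script name; equivalent to running it directly from a shell:
#   python deepseek_ocr.py page.png -o page_out
subprocess.run(['python', 'deepseek_ocr.py', 'page.png', '-o', 'page_out'], check=True)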
The script:

import argparse
import os

# Parse command line arguments first, so that --help returns quickly,
# before the heavy transformers/torch imports below.
parser = argparse.ArgumentParser(description='DeepSeek OCR - Convert images to markdown')
parser.add_argument('input', type=str, help='Input image file path')
parser.add_argument('-o', '--output', type=str, default=None,
                    help='Output directory path (default: input filename without extension)')
args = parser.parse_args()

# If output is not specified, use the input filename without its extension
if args.output is None:
    args.output = os.path.splitext(args.input)[0]

from transformers import AutoModel, AutoTokenizer
import torch

# os.environ["CUDA_VISIBLE_DEVICES"] = '0'

model_name = 'deepseek-ai/DeepSeek-OCR'

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True
)

# Load on CPU in float32; flash-attention requires CUDA, so it stays disabled.
model = AutoModel.from_pretrained(
    model_name,
    # _attn_implementation='flash_attention_2',
    trust_remote_code=True,
    use_safetensors=True,
    device_map='cpu',
    torch_dtype=torch.float32
)
# model = model.eval().cuda().to(torch.bfloat16)
model = model.eval()

# prompt = "<image>\nFree OCR. "
prompt = "<image>\n<|grounding|>Convert the document to markdown. "

image_file = args.input
output_path = args.output

# infer(self, tokenizer, prompt='', image_file='', output_path='', base_size=1024, image_size=640, crop_mode=True, test_compress=False, save_results=False)
# Resolution presets:
#   Tiny:   base_size=512,  image_size=512,  crop_mode=False
#   Small:  base_size=640,  image_size=640,  crop_mode=False
#   Base:   base_size=1024, image_size=1024, crop_mode=False
#   Large:  base_size=1280, image_size=1280, crop_mode=False
#   Gundam: base_size=1024, image_size=640,  crop_mode=True
res = model.infer(
    tokenizer,
    prompt=prompt,
    image_file=image_file,
    output_path=output_path,
    base_size=512,
    image_size=512,
    crop_mode=False,
    save_results=True,
    test_compress=True)
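The call above uses the Tiny preset, the cheapest of the presets listed in the comment. As a hedged sketch, the same call with the "Gundam" preset trades CPU time for the higher-resolution global-view-plus-crops pipeline:

# Hypothetical alternative: the "Gundam" preset (1024 global view plus
# 640x640 local crops). Same arguments otherwise; markedly slower on CPU.
res = model.infer(
    tokenizer,
    prompt=prompt,
    image_file=image_file,
    output_path=output_path,
    base_size=1024,
    image_size=640,
    crop_mode=True,
    save_results=True,
    test_compress=True)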
The patch to modeling_deepseekocr.py replaces the hard-coded .cuda() calls and bfloat16 casts with device- and dtype-aware equivalents, falling back to float32 on CPU (indentation reconstructed to plausible levels; apply with whitespace-tolerant matching if needed):

--- modeling_deepseekocr.py.orig	2025-10-28 17:58:51.152672675 +0900
+++ modeling_deepseekocr.py	2025-10-28 15:09:16.841199336 +0900
@@ -502,7 +502,7 @@
                 images_in_this_batch = torch.cat(images_in_this_batch, dim=0)

                 # exit()
-                inputs_embeds[idx].masked_scatter_(images_seq_mask[idx].unsqueeze(-1).cuda(), images_in_this_batch)
+                inputs_embeds[idx].masked_scatter_(images_seq_mask[idx].unsqueeze(-1).to(inputs_embeds.device), images_in_this_batch)

             idx += 1

@@ -703,6 +703,10 @@
     def infer(self, tokenizer, prompt='', image_file='', output_path = '', base_size=1024, image_size=640, crop_mode=True, test_compress=False, save_results=False, eval_mode=False):
         self.disable_torch_init()

+        # Detect device and dtype
+        device = next(self.parameters()).device
+        dtype = torch.bfloat16 if device.type == 'cuda' else torch.float32
+
         os.makedirs(output_path, exist_ok=True)
         os.makedirs(f'{output_path}/images', exist_ok=True)

@@ -798,8 +802,8 @@
-
-            images_list.append(image_transform(global_view).to(torch.bfloat16))
+
+            images_list.append(image_transform(global_view).to(dtype))

             # global_view_tensor = image_transform(global_view).to(torch.bfloat16)

@@ -810,9 +814,9 @@
             if width_crop_num > 1 or height_crop_num > 1:
                 """process the local views"""
-
+
                 for i in range(len(images_crop_raw)):
-                    images_crop_list.append(image_transform(images_crop_raw[i]).to(torch.bfloat16))
+                    images_crop_list.append(image_transform(images_crop_raw[i]).to(dtype))

             if image_size == 640:
                 valid_img_tokens += len(images_crop_list) * 100

@@ -846,7 +850,7 @@
             # else:
             global_view = ImageOps.pad(image, (image_size, image_size),
                                        color=tuple(int(x * 255) for x in image_transform.mean))
-            images_list.append(image_transform(global_view).to(torch.bfloat16))
+            images_list.append(image_transform(global_view).to(dtype))

             if base_size == 1024:
                 valid_img_tokens += int(256 * ratio)

@@ -911,12 +915,12 @@
         if not eval_mode:
             streamer = NoEOSTextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)
-            with torch.autocast("cuda", dtype=torch.bfloat16):
+            with torch.autocast(device.type, dtype=dtype):
                 with torch.no_grad():
                     output_ids = self.generate(
-                        input_ids.unsqueeze(0).cuda(),
-                        images=[(images_crop.cuda(), images_ori.cuda())],
-                        images_seq_mask = images_seq_mask.unsqueeze(0).cuda(),
+                        input_ids.unsqueeze(0).to(device),
+                        images=[(images_crop.to(device), images_ori.to(device))],
+                        images_seq_mask = images_seq_mask.unsqueeze(0).to(device),
                         images_spatial_crop = images_spatial_crop,
                         # do_sample=False,
                         # num_beams = 1,

@@ -929,12 +933,12 @@
                 )
         else:
-            with torch.autocast("cuda", dtype=torch.bfloat16):
+            with torch.autocast(device.type, dtype=dtype):
                 with torch.no_grad():
                     output_ids = self.generate(
-                        input_ids.unsqueeze(0).cuda(),
-                        images=[(images_crop.cuda(), images_ori.cuda())],
-                        images_seq_mask = images_seq_mask.unsqueeze(0).cuda(),
+                        input_ids.unsqueeze(0).to(device),
+                        images=[(images_crop.to(device), images_ori.to(device))],
+                        images_seq_mask = images_seq_mask.unsqueeze(0).to(device),
                         images_spatial_crop = images_spatial_crop,
                         # do_sample=False,
                         # num_beams = 1,

@@ -947,7 +951,7 @@

         if '<image>' in conversation[0]['content'] and eval_mode:
-            outputs = tokenizer.decode(output_ids[0, input_ids.unsqueeze(0).cuda().shape[1]:])
+            outputs = tokenizer.decode(output_ids[0, input_ids.unsqueeze(0).to(device).shape[1]:])
             stop_str = '<|end▁of▁sentence|>'
             if outputs.endswith(stop_str):
                 outputs = outputs[:-len(stop_str)]

@@ -957,7 +961,7 @@
             return outputs

         if '<image>' in conversation[0]['content'] and test_compress:
-            outputs = tokenizer.decode(output_ids[0, input_ids.unsqueeze(0).cuda().shape[1]:])
+            outputs = tokenizer.decode(output_ids[0, input_ids.unsqueeze(0).to(device).shape[1]:])
             pure_texts_outputs_token_length = len(text_encode(tokenizer, outputs, bos=False, eos=False))
             print('='*50)
             print('image size: ', (w, h))

@@ -968,7 +972,7 @@

         if '<image>' in conversation[0]['content'] and save_results:
-            outputs = tokenizer.decode(output_ids[0, input_ids.unsqueeze(0).cuda().shape[1]:])
+            outputs = tokenizer.decode(output_ids[0, input_ids.unsqueeze(0).to(device).shape[1]:])
             stop_str = '<|end▁of▁sentence|>'

             print('='*15 + 'save results:' + '='*15)
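With trust_remote_code=True, transformers runs the modeling_deepseekocr.py fetched from the model repo, so the patch has to be applied to the locally cached copy. A hedged sketch of one way to find it: huggingface_hub resolves the cached path, though note that transformers also keeps a copy under ~/.cache/huggingface/modules/transformers_modules/, and that copy is the one actually imported.

from huggingface_hub import hf_hub_download

# Resolve (downloading if needed) the remote-code file the patch targets.
path = hf_hub_download('deepseek-ai/DeepSeek-OCR', 'modeling_deepseekocr.py')
print(path)  # apply the diff above to this file, or to the transformers_modules copy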
pyproject.toml:

[project]
name = "deepseek-ocr-test"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
    "accelerate>=1.11.0",
    "addict>=2.4.0",
    "easydict>=1.13",
    "einops>=0.8.1",
    "hf-xet>=1.2.0",
    "numpy<2",
    "tokenizers==0.20.3",
    "torch>=2.9.0",
    "torchvision>=0.24.0",
    "transformers==4.46.3",
]
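The exact pins on transformers and tokenizers suggest the remote code is sensitive to those versions. A quick hedged sanity check after installing the dependencies:

import tokenizers
import torch
import transformers

# The remote code is presumably written against these pinned versions.
assert transformers.__version__ == '4.46.3'
assert tokenizers.__version__ == '0.20.3'
# CUDA is not required; the script and patch above run everything on CPU.
print('torch', torch.__version__, '| CUDA available:', torch.cuda.is_available())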