delfer · July 19, 2024 16:43
diff --git a/describe.py b/describe.py
 import sys
 import os
 import base64
 from PIL import Image
 import textract
 import litellm
 import csv
 import io
 from PyPDF2 import PdfReader

 def extract_first_line_error(error_message):
    """Return the part of ``error_message`` that precedes its first newline."""
    first_line, _, _ = error_message.partition('\n')
    return first_line

 def extract_text(file_path):
    """Extract the text of a document and return it as a string.

    Paths ending in ``.pdf`` (case-insensitive) are delegated to
    ``extract_text_from_pdf``. Everything else is run through
    ``textract.process`` and decoded as UTF-8. A failure on the textract
    path is not raised: it is returned as an ``"Error extracting text: ..."``
    string carrying the first line of the exception message.
    """
    is_pdf = file_path.lower().endswith('.pdf')
    if is_pdf:
        return extract_text_from_pdf(file_path)

    try:
        raw_bytes = textract.process(file_path)
        decoded = raw_bytes.decode('utf-8')
    except Exception as exc:
        reason = extract_first_line_error(str(exc))
        return f"Error extracting text: {reason}"
    return decoded

 def extract_text_from_pdf(file_path):
    """Return the text of every page of a PDF, each page followed by a newline.

    Any exception raised while opening or reading the file is converted into
    an ``"Error extracting text from PDF: ..."`` string (first line of the
    exception message only) instead of propagating.
    """
    try:
        with open(file_path, 'rb') as pdf_stream:
            pdf = PdfReader(pdf_stream)
            page_texts = [page.extract_text() + "\n" for page in pdf.pages]
            return "".join(page_texts)
    except Exception as exc:
        return f"Error extracting text from PDF: {extract_first_line_error(str(exc))}"

 def process_image(file_path, max_dimension=1092):
    """Load an image, downscale it if needed, and return it base64-encoded.

    The image is re-encoded in its own detected format. When its longest side
    exceeds ``max_dimension`` pixels it is shrunk so that the longest side
    fits within ``max_dimension`` while the aspect ratio is preserved.

    Args:
        file_path: Path of the image file to open with Pillow.
        max_dimension: Upper bound, in pixels, for the longest side of the
            encoded image. Defaults to 1092.

    Returns:
        The base64 text of the (possibly resized) image on success. On any
        failure, a string starting with ``"Error processing image: "``
        followed by the first line of the exception message; no exception
        is raised.
    """
    try:
        with Image.open(file_path) as img:
            img_format = img.format
            if img_format is None:
                return f"Error processing image: unknown file format"
            longest_side = max(img.size)
            if longest_side > max_dimension:
                # Scale against the longest side. Pinning the width instead
                # would enlarge portrait images rather than shrink them.
                # max(1, ...) stops an extreme aspect ratio from collapsing
                # a side to zero pixels.
                new_width = max(1, img.width * max_dimension // longest_side)
                new_height = max(1, img.height * max_dimension // longest_side)
                img = img.resize((new_width, new_height), resample=Image.LANCZOS)
            buffered = io.BytesIO()
            img.save(buffered, format=img_format.lower())
            return base64.b64encode(buffered.getvalue()).decode('utf-8')
    except Exception as e:
        return f"Error processing image: {extract_first_line_error(str(e))}"

 def summarize_content(file_name, content):
    """Ask the LLM for a one-sentence description of a file.

    Args:
        file_name: Name of the file. It is quoted in the prompt, and its
            extension selects the image prompt or the text prompt.
        content: For image extensions (png, jpg, jpeg, gif, bmp) the
            base64-encoded image data; otherwise the file's text, of which
            only the first 4000 characters are sent. ``bytes`` are decoded
            as UTF-8, falling back to base64 text when they are not valid
            UTF-8.

    Returns:
        The content of the model's first choice, or an
        ``"Error summarizing content: ..."`` string (first line of the
        exception message) if the request fails.
    """
    if isinstance(content, bytes):
        try:
            content = content.decode('utf-8')
        except UnicodeDecodeError:
            content = base64.b64encode(content).decode('utf-8')

    file_extension = os.path.splitext(file_name)[1][1:].lower()

    if file_extension in ['png', 'jpg', 'jpeg', 'gif', 'bmp']:
        # "jpg" is only a file extension; the registered media type is
        # "image/jpeg", so normalise it before building the data URL.
        image_subtype = 'jpeg' if file_extension == 'jpg' else file_extension
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": f"Describe the content of the image '{file_name}' in one sentence:"},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/{image_subtype};base64,{content}",
                            "detail": "high"
                        }
                    }
                ]
            }
        ]
    else:
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": f"Describe the content of the file '{file_name}' in one sentence:"},
                    {"type": "text", "text": f"\n\nFile content:\n{content[:4000]}"}
                ]
            }
        ]

    try:
        response = litellm.completion(
            model="openai/gpt-4o-mini",
            messages=messages,
            max_tokens=300
        )
        return response.choices[0].message.content
    except Exception as e:
        return f"Error summarizing content: {extract_first_line_error(str(e))}"

 def output_csv(file_name, content):
    """Print ``file_name`` and ``content`` to stdout as one CSV record.

    Fields are quoted only when needed (``csv.QUOTE_MINIMAL``) and the record
    is followed by a single newline.
    """
    csv_output = io.StringIO()
    writer = csv.writer(csv_output, quoting=csv.QUOTE_MINIMAL)
    writer.writerow([file_name, content])
    # Drop only the writer's row terminator. A blanket strip() would also eat
    # leading/trailing whitespace that belongs to the unquoted fields. A field
    # that itself ends in a line break is quoted, so it is never touched here.
    record = csv_output.getvalue().rstrip("\r\n")
    print(record)

 def main(file_path):
    """Describe one file and print the result as a ``file_name,summary`` CSV row.

    Image files are base64-encoded via ``process_image``; every other file
    (PDFs included) goes through ``extract_text``. The result is passed to
    ``summarize_content`` and printed with ``output_csv``. Any unexpected
    exception is reported in the CSV row instead of propagating.
    """
    file_name = os.path.basename(file_path)
    image_suffixes = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')
    try:
        if file_path.lower().endswith(image_suffixes):
            content = process_image(file_path)
            # process_image reports failure as a message instead of raising.
            # Base64 never contains ':' or spaces, so this prefix can only be
            # that message; forwarding it as image data would waste an API
            # call and bury the real cause under a generic API error.
            if content.startswith("Error processing image:"):
                output_csv(file_name, content)
                return
        else:
            # extract_text routes '.pdf' paths to extract_text_from_pdf itself.
            content = extract_text(file_path)

        summary = summarize_content(file_name, content)
        output_csv(file_name, summary)
    except Exception as e:
        output_csv(file_name, f"Unexpected error: {extract_first_line_error(str(e))}")

 if __name__ == "__main__":
    # Script entry point: expects exactly one argument, the file to describe.
    if len(sys.argv) != 2:
        # stdout carries the CSV row, so the usage hint goes to stderr and
        # the non-zero status lets callers detect the bad invocation.
        print("Usage: python script.py <file_path>", file=sys.stderr)
        sys.exit(2)
    else:
        main(sys.argv[1])
	import sys
	import os
	import base64
	from PIL import Image
	import textract
	import litellm
	import csv
	import io
	from PyPDF2 import PdfReader

	def extract_first_line_error(error_message):
	    """Return the part of ``error_message`` that precedes its first newline."""
	    first_line, _, _ = error_message.partition('\n')
	    return first_line

	def extract_text(file_path):
	    """Extract the text of a document and return it as a string.

	    Paths ending in ``.pdf`` (case-insensitive) are delegated to
	    ``extract_text_from_pdf``. Everything else is run through
	    ``textract.process`` and decoded as UTF-8. A failure on the textract
	    path is not raised: it is returned as an ``"Error extracting text: ..."``
	    string carrying the first line of the exception message.
	    """
	    if file_path.lower().endswith('.pdf'):
	        return extract_text_from_pdf(file_path)
	    try:
	        return textract.process(file_path).decode('utf-8')
	    except Exception as e:
	        return f"Error extracting text: {extract_first_line_error(str(e))}"

	def extract_text_from_pdf(file_path):
	    """Return the text of every page of a PDF, each page followed by a newline.

	    Any exception raised while opening or reading the file is converted into
	    an ``"Error extracting text from PDF: ..."`` string (first line of the
	    exception message only) instead of propagating.
	    """
	    try:
	        with open(file_path, 'rb') as file:
	            reader = PdfReader(file)
	            # Join once instead of growing a string page by page.
	            return "".join(page.extract_text() + "\n" for page in reader.pages)
	    except Exception as e:
	        return f"Error extracting text from PDF: {extract_first_line_error(str(e))}"

	def process_image(file_path, max_dimension=1092):
	    """Load an image, downscale it if needed, and return it base64-encoded.

	    The image is re-encoded in its own detected format. When its longest side
	    exceeds ``max_dimension`` pixels it is shrunk so that the longest side
	    fits within ``max_dimension`` while the aspect ratio is preserved.

	    Args:
	        file_path: Path of the image file to open with Pillow.
	        max_dimension: Upper bound, in pixels, for the longest side of the
	            encoded image. Defaults to 1092.

	    Returns:
	        The base64 text of the (possibly resized) image on success. On any
	        failure, a string starting with ``"Error processing image: "``
	        followed by the first line of the exception message; no exception
	        is raised.
	    """
	    try:
	        with Image.open(file_path) as img:
	            img_format = img.format
	            if img_format is None:
	                return f"Error processing image: unknown file format"
	            longest_side = max(img.size)
	            if longest_side > max_dimension:
	                # Scale against the longest side. Pinning the width instead
	                # would enlarge portrait images rather than shrink them.
	                # max(1, ...) stops an extreme aspect ratio from collapsing
	                # a side to zero pixels.
	                new_width = max(1, img.width * max_dimension // longest_side)
	                new_height = max(1, img.height * max_dimension // longest_side)
	                img = img.resize((new_width, new_height), resample=Image.LANCZOS)
	            buffered = io.BytesIO()
	            img.save(buffered, format=img_format.lower())
	            return base64.b64encode(buffered.getvalue()).decode('utf-8')
	    except Exception as e:
	        return f"Error processing image: {extract_first_line_error(str(e))}"

	def summarize_content(file_name, content):
	    """Ask the LLM for a one-sentence description of a file.

	    Args:
	        file_name: Name of the file. It is quoted in the prompt, and its
	            extension selects the image prompt or the text prompt.
	        content: For image extensions (png, jpg, jpeg, gif, bmp) the
	            base64-encoded image data; otherwise the file's text, of which
	            only the first 4000 characters are sent. ``bytes`` are decoded
	            as UTF-8, falling back to base64 text when they are not valid
	            UTF-8.

	    Returns:
	        The content of the model's first choice, or an
	        ``"Error summarizing content: ..."`` string (first line of the
	        exception message) if the request fails.
	    """
	    if isinstance(content, bytes):
	        try:
	            content = content.decode('utf-8')
	        except UnicodeDecodeError:
	            content = base64.b64encode(content).decode('utf-8')

	    file_extension = os.path.splitext(file_name)[1][1:].lower()

	    if file_extension in ['png', 'jpg', 'jpeg', 'gif', 'bmp']:
	        # "jpg" is only a file extension; the registered media type is
	        # "image/jpeg", so normalise it before building the data URL.
	        image_subtype = 'jpeg' if file_extension == 'jpg' else file_extension
	        messages = [
	            {
	                "role": "user",
	                "content": [
	                    {"type": "text", "text": f"Describe the content of the image '{file_name}' in one sentence:"},
	                    {
	                        "type": "image_url",
	                        "image_url": {
	                            "url": f"data:image/{image_subtype};base64,{content}",
	                            "detail": "high"
	                        }
	                    }
	                ]
	            }
	        ]
	    else:
	        messages = [
	            {
	                "role": "user",
	                "content": [
	                    {"type": "text", "text": f"Describe the content of the file '{file_name}' in one sentence:"},
	                    {"type": "text", "text": f"\n\nFile content:\n{content[:4000]}"}
	                ]
	            }
	        ]

	    try:
	        response = litellm.completion(
	            model="openai/gpt-4o-mini",
	            messages=messages,
	            max_tokens=300
	        )
	        return response.choices[0].message.content
	    except Exception as e:
	        return f"Error summarizing content: {extract_first_line_error(str(e))}"

	def output_csv(file_name, content):
	    """Print ``file_name`` and ``content`` to stdout as one CSV record.

	    Fields are quoted only when needed (``csv.QUOTE_MINIMAL``) and the record
	    is followed by a single newline.
	    """
	    csv_output = io.StringIO()
	    writer = csv.writer(csv_output, quoting=csv.QUOTE_MINIMAL)
	    writer.writerow([file_name, content])
	    # Drop only the writer's row terminator. A blanket strip() would also eat
	    # leading/trailing whitespace that belongs to the unquoted fields.
	    print(csv_output.getvalue().rstrip("\r\n"))

	def main(file_path):
	    """Describe one file and print the result as a ``file_name,summary`` CSV row.

	    Image files are base64-encoded via ``process_image``; every other file
	    (PDFs included) goes through ``extract_text``. The result is passed to
	    ``summarize_content`` and printed with ``output_csv``. Any unexpected
	    exception is reported in the CSV row instead of propagating.
	    """
	    file_name = os.path.basename(file_path)
	    image_suffixes = ('.png', '.jpg', '.jpeg', '.gif', '.bmp')
	    try:
	        if file_path.lower().endswith(image_suffixes):
	            content = process_image(file_path)
	            # process_image reports failure as a message instead of raising.
	            # Base64 never contains ':' or spaces, so this prefix can only be
	            # that message; forwarding it as image data would waste an API
	            # call and bury the real cause under a generic API error.
	            if content.startswith("Error processing image:"):
	                output_csv(file_name, content)
	                return
	        else:
	            # extract_text routes '.pdf' paths to extract_text_from_pdf itself.
	            content = extract_text(file_path)

	        summary = summarize_content(file_name, content)
	        output_csv(file_name, summary)
	    except Exception as e:
	        output_csv(file_name, f"Unexpected error: {extract_first_line_error(str(e))}")

	if __name__ == "__main__":
	    # Script entry point: expects exactly one argument, the file to describe.
	    if len(sys.argv) != 2:
	        # stdout carries the CSV row, so the usage hint goes to stderr and
	        # the non-zero status lets callers detect the bad invocation.
	        print("Usage: python script.py <file_path>", file=sys.stderr)
	        sys.exit(2)
	    else:
	        main(sys.argv[1])
No results found