"""Sample script: extract text and images from a PDF with pdfminer."""
from pathlib import Path

from pdfminer.high_level import extract_text

# Extract text from a PDF file.
source = Path('atmarkit_ebook116.pdf')

# extract_text is given the PDF's path; its result is kept in `text` and
# printed as-is.
text = extract_text(source)
print(text)



# Use the extract_text_to_fp function, which writes the extracted text to a
# file object instead of returning it.
from pdfminer.high_level import extract_text_to_fp

dest = Path('out.txt')

# Both files are opened in binary mode: the PDF is read as raw bytes and the
# extracted text is written to out.txt as bytes.
with open(source, 'rb') as fp_in, open(dest, 'wb') as fp_out:
    extract_text_to_fp(fp_in, fp_out)

# Read the result back as UTF-8 text.
# NOTE(review): this assumes extract_text_to_fp wrote UTF-8 (its documented
# default codec) -- confirm if a different codec is ever passed.
print(dest.read_text(encoding='utf-8'))



# To load the text into memory with extract_text_to_fp, pass it a StringIO
# as the output stream instead of a file on disk.
from io import StringIO

fp_out = StringIO()
with open(source, 'rb') as fp_in:
    extract_text_to_fp(fp_in, fp_out)

# Everything written to the in-memory buffer is retrieved as one string.
text = fp_out.getvalue()

# Replace every form feed (chr(12)) with a newline (chr(10)).
# NOTE(review): presumably the form feeds are pdfminer's page separators --
# confirm against the pdfminer documentation.
text = text.replace('\f', '\n')
print(text)



# Save image files: passing output_dir asks extract_text_to_fp to export the
# images it finds into that directory while the text goes to fp_out.
output_dir = 'imgs'
fp_out = StringIO()
with open(source, 'rb') as fp_in:
    # extract_text_to_fp is documented to return None, so its result is not
    # assigned; the extracted text is taken from the buffer below.
    extract_text_to_fp(fp_in, fp_out, output_dir=output_dir)

# Keep `text` bound to the extracted string rather than to None.
text = fp_out.getvalue()