# A Python script that counts the letters of a PDF by color:
import sys
import pymupdf
import pprint
import webcolors as wc
def color_label(srgb):
    """Return a readable label for an integer sRGB color value.

    Args:
        srgb: Color as an integer, formatted here as ``#rrggbb``.

    Returns:
        The color name reported by ``webcolors`` for that hex code, or the
        ``#rrggbb`` string itself when ``webcolors`` knows no name for it.
    """
    hex_code = "#%06x" % srgb
    try:
        return wc.hex_to_name(hex_code)
    except ValueError:
        # Most colors have no named equivalent; keep counting them under
        # their hex code instead of aborting the whole run.
        return hex_code


def count_chars_by_color(blocks, label=color_label):
    """Count characters per text color in a "dict"-style block list.

    Args:
        blocks: Iterable of block dicts. Each text block holds ``"lines"``,
            each line holds ``"spans"``, and each span holds ``"text"`` and
            an integer ``"color"``.
        label: Callable mapping a span's integer color to the dictionary key
            it is counted under. Defaults to :func:`color_label`.

    Returns:
        A dict mapping each color label to the total length of the span
        texts that use that color.
    """
    totals = {}
    for block in blocks:
        # Blocks without a "lines" entry carry no text spans and are skipped
        # (presumably image blocks, if the caller requested them).
        for line in block.get("lines", ()):
            for span in line["spans"]:
                key = label(span["color"])
                totals[key] = totals.get(key, 0) + len(span["text"])
    return totals


def main(argv):
    """Print the per-color character counts of a PDF's first page.

    Args:
        argv: Command-line arguments; ``argv[1]`` is the path of the
            document to open.
    """
    if len(argv) < 2:
        sys.exit("usage: %s FILE.pdf" % (argv[0] if argv else "script"))

    # The context manager closes the document even if extraction fails.
    with pymupdf.open(argv[1]) as doc:
        page = doc[0]
        # read page text as a dictionary, suppressing extra spaces in CJK fonts
        blocks = page.get_text("dict", flags=11)["blocks"]

    pprint.pprint(count_chars_by_color(blocks))


if __name__ == "__main__":
    main(sys.argv)