# http://nedbatchelder.com/blog/200712/extracting_jpgs_from_pdfs.html
# jp2 version
"""Extract embedded JPEG 2000 (JP2) images from a PDF file.

Usage::

    python <this script> FILE.pdf

Every JP2 image found inside a PDF stream object is written to
``FILE/000.jp2``, ``FILE/001.jp2``, ... where ``FILE`` is the input path
with its extension removed.  The directory is only created when at least
one image is found.

The PDF is not parsed; the raw bytes are scanned for the ``stream``
keyword followed closely by the JP2 signature.  The code runs unchanged
on Python 2.6+ and Python 3.
"""
import os
import sys

# JP2 signature box (12 bytes) followed by the first (zero) byte of the
# length field of the box that comes after it.
STARTMARK = b"\x00\x00\x00\x0c\x6a\x50\x20\x20\x0d\x0a\x87\x0a\x00"
# Offset added to the position of STARTMARK to get the first image byte.
STARTFIX = 0
# End-of-codestream marker that terminates the image data.
ENDMARK = b"\xff\xd9"
# Offset added to the position of ENDMARK to get one-past-the-last byte.
ENDFIX = 2

STREAM_KEYWORD = b"stream"
ENDSTREAM_KEYWORD = b"endstream"
# How far (in bytes) from a keyword the corresponding mark may be.
SEARCH_WINDOW = 20


def iter_jp2_spans(pdf):
    """Yield ``(start, end)`` byte offsets of each JP2 image in *pdf*.

    *pdf* is the complete content of a PDF file as a byte string.
    ``pdf[start:end]`` is the image, from the JP2 signature up to and
    including the end-of-codestream marker.

    Raises:
        Exception: if a JP2 stream has no ``endstream`` keyword after it,
            or no end marker can be found near that keyword.
    """
    pos = 0
    while True:
        istream = pdf.find(STREAM_KEYWORD, pos)
        if istream < 0:
            return

        # The window limits where the signature may *start*; the upper
        # bound passed to find() must therefore leave room for the whole
        # mark, otherwise a stream opened with "stream\r\n" is missed.
        istart = pdf.find(
            STARTMARK, istream, istream + SEARCH_WINDOW + len(STARTMARK)
        )
        if istart < 0:
            # Not a JP2 stream (or the tail of "endstream").  Step past
            # this keyword only, so an adjacent stream is never skipped.
            pos = istream + len(STREAM_KEYWORD)
            continue

        iendstream = pdf.find(ENDSTREAM_KEYWORD, istart)
        if iendstream < 0:
            raise Exception("Didn't find end of stream!")

        # Look for the end marker shortly before "endstream".  Clamp the
        # start so it can neither go negative (which would wrap around to
        # the end of the buffer) nor fall before the image itself.
        iend = pdf.find(ENDMARK, max(iendstream - SEARCH_WINDOW, istart))
        if iend < 0:
            raise Exception("Didn't find end of JP2!")

        istart += STARTFIX
        iend += ENDFIX
        yield istart, iend
        pos = iend


def extract_jp2(filename):
    """Write every JP2 image embedded in the PDF *filename* to disk.

    Images go to ``<filename without extension>/NNN.jp2``.  A progress
    line is printed for each image.

    Returns:
        The number of images written.

    Raises:
        ValueError: if *filename* has no extension, so no output
            directory name distinct from the input can be derived.
        OSError: if the output directory already exists or cannot be
            created, or a file cannot be read or written.
        Exception: propagated from :func:`iter_jp2_spans` on a malformed
            stream.
    """
    outdir, ext = os.path.splitext(filename)
    if not ext:
        raise ValueError(
            "%r has no extension to strip for the output directory"
            % (filename,)
        )

    with open(filename, "rb") as pdffile:
        pdf = pdffile.read()

    njp2 = 0
    for istart, iend in iter_jp2_spans(pdf):
        if njp2 == 0:
            # Created lazily so that nothing is left behind when the PDF
            # holds no JP2 image or is rejected before the first one.
            os.mkdir(outdir)
        print("JP2 %d from %d to %d" % (njp2, istart, iend))
        with open(os.path.join(outdir, "%03d.jp2" % njp2), "wb") as jp2file:
            jp2file.write(pdf[istart:iend])
        njp2 += 1
    return njp2


def main(argv=None):
    """Command-line entry point; returns the process exit status."""
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        prog = argv[0] if argv else "extract_jp2"
        sys.stderr.write("usage: %s FILE.pdf\n" % prog)
        return 2
    extract_jp2(argv[1])
    return 0


if __name__ == "__main__":
    sys.exit(main())