Rectangle 27 0

html how to display pdf file contents as well as its full name in the browser using cgi python script?


#!/usr/bin/env python
import cgi
import cgitb; cgitb.enable()
import os
from itertools import imap
from subprocess import check_output

# Absolute paths of the external programs this script runs.
PDFINFO = '/usr/bin/pdfinfo'
CONVERT = '/usr/bin/convert'
# Directory against which the requested file names are resolved.
DOC_ROOT = '/home/bj/Documents'

# Complete CGI response (header, blank line, HTML document) with
# placeholders for the page title and the body markup.
BASE_TEMPLATE = ''.join([
    'Content-type: text/html\n\n',
    '<html><head><title>{title}</title></head>',
    '<body>{body}</body></html>',
])
# Body markup for one PDF page: heading, navigation line, page image.
PDF_PAGE_TEMPLATE = ''.join([
    '<h1>{filename}</h1>',
    '<p>{prev_link} {page}/{page_count} {next_link}</p>',
    '<p><img src="{image_url}" style="border: solid thin gray;"></p>',
])

# Value of the SCRIPT_NAME environment variable, used as the base of every
# URL this script generates.  The lookup raises KeyError at import time if
# the variable is not set (presumably the web server sets it for CGI runs).
SCRIPT_NAME = os.environ['SCRIPT_NAME']


def create_page_url(filename, page_number, type_):
    """Return the URL of this script for one page of *filename*.

    The file name is percent-encoded so that characters with a special
    meaning in URLs (``&``, ``#``, ``+``, ``%``, spaces, ...) survive the
    round trip through the query string.  The finished URL is then escaped
    once for HTML, which makes the result safe to drop into a double-quoted
    attribute such as ``href`` or ``src``.

    :param filename: file name as given in the ``file`` request parameter.
    :param page_number: page number to put into the ``page`` parameter.
    :param type_: value for the ``type`` parameter (``'html'`` or ``'jpg'``).
    :returns: the HTML-escaped URL as a string.
    """
    # Python 2 location of quote(); imported locally so the block does not
    # depend on a module-level import being added.
    from urllib import quote

    url = '{0}?file={1}&page={2}&type={3}'.format(
        SCRIPT_NAME,
        quote(filename),
        page_number,
        type_
    )
    # Escape after assembling so the '&' separators become '&amp;' too.
    return cgi.escape(url, True)


def create_page_link(text, filename, page_number):
    """Return HTML for a navigation link labelled *text*.

    When *page_number* is ``None`` there is no target page and the label is
    rendered as gray, non-clickable text.  Otherwise an anchor pointing at
    the HTML view of that page is returned.
    """
    label = cgi.escape(text)
    if page_number is not None:
        target = create_page_url(filename, page_number, 'html')
        return '<a href="{0}">{1}</a>'.format(target, label)
    return '<span style="color: gray;">{0}</span>'.format(label)


def get_page_count(filename):
    """Return the page count that the pdfinfo program reports for *filename*.

    The program's output is read as ``key: value`` lines; the value stored
    under ``Pages`` is converted to an integer.  Raises ``KeyError`` if the
    output has no ``Pages`` line.
    """
    report = check_output([PDFINFO, filename])
    fields = {}
    for line in report.splitlines():
        # Split on the first colon only, values may contain colons too.
        name, _, content = line.partition(':')
        fields[name] = content.strip()
    return int(fields['Pages'])


def get_page(filename, page_index):
    """Return one page of the PDF *filename* rendered as JPEG data.

    The rendering is delegated to the convert program, called with a
    density of 96 and the ``filename[page_index]`` page selector; its
    standard output is returned unchanged.
    """
    page_selector = '{0}[{1}]'.format(filename, page_index)
    command = [CONVERT, '-density', '96', page_selector, 'jpg:-']
    return check_output(command)


def send_error(message):
    print BASE_TEMPLATE.format(
        title='Error', body='<h1>Error</h1>{0}'.format(message)
    )


def send_page_html(_pdf_path, filename, page_number, page_count):
    body = PDF_PAGE_TEMPLATE.format(
        filename=cgi.escape(filename),
        page=page_number,
        page_count=page_count,
        image_url=create_page_url(filename, page_number, 'jpg'),
        prev_link=create_page_link(
            '<<', filename, page_number - 1 if page_number > 1 else None
        ),
        next_link=create_page_link(
            '>>',
            filename,
            page_number + 1 if page_number < page_count else None
        )
    )
    print BASE_TEMPLATE.format(title='PDF', body=body)


def send_page_image(pdf_path, _filename, page_number, _page_count):
    """Write page *page_number* of the PDF at *pdf_path* as a JPEG response.

    :param pdf_path: path of the PDF file handed to ``get_page()``.
    :param _filename: unused; present for a signature shared with the other
        send functions.
    :param page_number: one-based page number; ``get_page()`` is called with
        ``page_number - 1``.
    :param _page_count: unused, see ``_filename``.
    """
    import sys

    image_data = get_page(pdf_path, page_number - 1)
    out = sys.stdout
    # 'image/jpeg' is the registered media type for JPEG data.
    out.write('Content-type: image/jpeg\n')
    out.write('Content-Length: {0}\n\n'.format(len(image_data)))
    # write() instead of print: print would append a newline and make the
    # body one byte longer than the announced Content-Length.
    out.write(image_data)


# Maps the value of the ``type`` request parameter to the function that
# produces the response for it.
TYPE2SEND_FUNCTION = dict(html=send_page_html, jpg=send_page_image)


def main():
    """Handle one CGI request.

    Reads the ``file``, ``page`` (default 1) and ``type`` (default
    ``'html'``) request parameters, validates them and dispatches to the
    matching send function.  Every invalid request is answered with an
    error page instead of an unhandled exception:

    * no ``file`` parameter,
    * a ``page`` value that is not an integer,
    * a ``type`` without an entry in ``TYPE2SEND_FUNCTION``,
    * a file that is not a regular file below ``DOC_ROOT``.

    Page numbers outside the document are clamped to the first/last page.
    """
    form = cgi.FieldStorage()
    filename = form.getfirst('file')
    type_ = form.getfirst('type', 'html')

    if not filename:
        send_error('<p>No PDF file given.</p>')
        return

    try:
        page_number = int(form.getfirst('page', 1))
    except ValueError:
        send_error('<p>Invalid page number.</p>')
        return

    send_function = TYPE2SEND_FUNCTION.get(type_)
    if send_function is None:
        send_error(
            '<p>Unknown type <em>{0!r}</em>.</p>'.format(cgi.escape(type_))
        )
        return

    pdf_path = os.path.abspath(os.path.join(DOC_ROOT, filename))
    # Compare against the root *with* a trailing separator; a bare prefix
    # test would also accept sibling directories such as DOC_ROOT + '_x'.
    root_prefix = os.path.join(os.path.abspath(DOC_ROOT), '')
    if os.path.isfile(pdf_path) and pdf_path.startswith(root_prefix):
        page_count = get_page_count(pdf_path)
        page_number = min(max(1, page_number), page_count)
        send_function(pdf_path, filename, page_number, page_count)
    else:
        send_error(
            '<p>PDF file <em>{0!r}</em> not found.</p>'.format(
                cgi.escape(filename)
            )
        )


# Script entry point: handle the request as soon as the file is executed.
main()
<embed>
<object>

@user956424 I've added a CGI that renders the pages on the server. Why no JavaScript? Why CGI (instead of WSGI)? Where do these requirements come from?

A fallback solution, working on every browser, would be rendering the PDF pages on the server as images and serving those to the client. This puts some stress on the server (processor, memory/disk for caching, bandwidth).

Can you suggest an alternative solution where this can be achieved?

Given the other constraints this is not possible IMHO. You'll have to allow JavaScript and/or third-party browser plugins and have to live with the fact that not every browser will be able to display it the way you want. You may also pre-render and/or cache page images on the server to lower the CPU load with a solution that renders the pages on the server.

It is not possible. At least not that simple. Some web browsers don't display PDFs but ask the user to download the file, some display them themselves, some embed an external PDF viewer component, some start an external PDF viewer. There is no standard, cross browser way to embed PDF into HTML, which would be needed if you want to display arbitrary text and the PDF content.

Rendering and serving the PDF pages as images needs some software on the server to query the number of pages and to extract and render a given page as an image.

Some modern, HTML5 capable browsers can render PDFs with Mozilla's pdf.js on a canvas element.

The number of pages can be determined with the pdfinfo program from Xpdf or the libpoppler command line utilities. Converting a page from the PDF file to a JPG image can be done with convert from the ImageMagick tools. A very simple CGI program using these programs:

There are Python bindings for libpoppler, so the call to the external pdfinfo program could be replaced quite easily with that module. It may also be used to extract more information for the pages like links on the PDF pages to create HTML image maps for them. With the libcairo Python bindings installed it may even be possible to do the rendering of a page without an external process.

Note