[initial version Ganesh Sittampalam **20100405112141 Ignore-this: 77e565944a40ef01a64258102231afba ] addfile ./CaseList.hs hunk ./CaseList.hs 1 +{-# LANGUAGE ViewPatterns, PatternGuards, NoMonomorphismRestriction #-} +module CaseList where hunk ./CaseList.hs 4 +import Control.Applicative +import Control.Monad +import Data.List +import Data.Maybe +import Text.HTML.TagSoup +import Text.HTML.TagSoup.Tree +import System.Process +import System.Environment +import System.Exit +import System.IO + +main = do + args <- getArgs + case args of + [file] -> do go file + return () + _ -> usage + +usage = do + name <- getProgName + hPutStrLn stderr $ "usage: " ++ name ++ " " + exitWith (ExitFailure 1) + +revTake n = reverse . take n . reverse +revDrop n = reverse . drop n . reverse + +hasSuffix x str | x `isSuffixOf` str = Just (revDrop (length x) str) + | otherwise = Nothing + + +isJunk (TagText str) | all (\c -> c=='\n' || c=='\t') str = True +isJunk _ = False + +recogniseDoc [TagLeaf (TagOpen "?xml" [("version","1.0"),("encoding","UTF-8")]), + TagLeaf (TagOpen "!DOCTYPE" [("pdf2xml",""),("SYSTEM",""),("","pdf2xml.dtd")]), + TagBranch "pdf2xml" [] pages] + = concat <$> mapM recognisePage pages +recogniseDoc _ = fail "top-level structure" + +recognisePage (TagBranch "page" _ items) = tabulate <$> catMaybes <$> mapM recogniseItem items + +recognisePage x = fail $ "page: " ++ show x + +tabulate (((row, column), info):rest@(((row', column'), _):_)) + | column == column' = ((fst info || fst x, snd info ++ snd x):xs):xss + | row == row' = (info:x:xs):xss + | otherwise = [info]:(x:xs):xss + where (x:xs):xss = tabulate rest +tabulate [(_, info)] = [[info]] +tabulate [] = [] + +recogniseItem (TagBranch "text" attrs [TagBranch "b" [] [TagLeaf (TagText text)]]) + | Just top <- lookup "top" attrs, Just left <- lookup "left" attrs + = return $ Just ((top, left), (True, text)) +recogniseItem (TagBranch "text" attrs [TagLeaf (TagText text)]) + | Just top <- lookup "top" attrs, Just left <- lookup "left" attrs + = return $ Just ((top, left), (False, text)) +recogniseItem (TagBranch "fontspec" _ _) = return Nothing +recogniseItem x = fail $ "item: " ++ show x + +go file@(hasSuffix ".pdf" -> Just stem) = do + (_, _, _, ph) <- createProcess (proc "/usr/bin/pdftohtml" ["-c", "-xml", file]) + _ <- waitForProcess ph + table <- join $ recogniseDoc <$> tagTree <$> filter (not . isJunk) <$> parseTags <$> readFile (stem ++ ".xml") + writeFile (stem ++ ".html") ("" ++ concat ["" ++ concat ["" | (title, text) <- row] ++ "" | row <- table] ++ "
" ++ (if title then "" ++ text ++ "" else text) ++ "
") + +go _ = usage