[refactor around data structure Ganesh Sittampalam **20100405191559 Ignore-this: 6a033c5d0a5cd37d275534667a254074 ] hunk ./CaseList.hs 46 +data Doc = Doc { docDate :: String + , docHeader :: [String] + , docRecords :: [[String]] + } + +docToCSV doc = docHeader doc : docRecords doc + +docToHTML doc = "" ++ docDate doc ++ "\n" + ++ concat (row "th" (docHeader doc):map (row "td") (docRecords doc)) + ++ "
\n" + where row elem items = " \n" ++ concat [" <" ++ elem ++ ">" ++ item ++ "\n" | item <- items] ++ " \n" + hunk ./CaseList.hs 61 +data LineInfo = LineInfo { liHeader :: Maybe [String] + , liDate :: Maybe String + , liRecords :: [[String]] + } + +baseLineInfo = LineInfo { liHeader = Nothing, liDate = Nothing, liRecords = [] } + +lineToLineInfo [(True, hasPrefix "Freedom of Information and Environmental Information Regulations caseload snapshot in " -> Just date)] + = return $ baseLineInfo { liDate = Just date } +lineToLineInfo [(False, hasPrefix "This report provides a snapshot of the Freedom of Information and Environmental Information Regulations cases " -> Just _)] = return baseLineInfo +lineToLineInfo recs | all fst recs = return $ baseLineInfo { liHeader = Just (map snd recs) } +lineToLineInfo recs | all (not . fst) recs = return $ baseLineInfo { liRecords = [map snd recs] } +lineToLineInfo recs = fail $ "inconsistent titling: " ++ show recs + +joinLineInfo li1 li2 = do hdr <- joinMaybes "multiple header rows found" (liHeader li1) (liHeader li2) + date <- joinMaybes "multiple dates found" (liDate li1) (liDate li2) + let recs = liRecords li1 ++ liRecords li2 + return $ LineInfo { liHeader = hdr, liDate = date, liRecords = recs } + +joinMaybes _ Nothing Nothing = return Nothing +joinMaybes _ (Just x) Nothing = return $ Just x +joinMaybes _ Nothing (Just x) = return $ Just x +joinMaybes err _ _ = fail err + hunk ./CaseList.hs 88 - = concat <$> mapM recognisePage pages + = do lines <- concat <$> mapM recognisePage pages + lineInfo <- foldM (flip joinLineInfo) baseLineInfo =<< mapM lineToLineInfo lines + date <- case liDate lineInfo of + Nothing -> fail "couldn't find date" + Just r -> return r + hdr <- case liHeader lineInfo of + Nothing -> fail "couldn't find header row" + Just r -> return r + return $ Doc { docDate = date + , docHeader = hdr + , docRecords = liRecords lineInfo + } + hunk ./CaseList.hs 103 -recognisePage (TagBranch "page" _ items) = filter (not . tabulatedJunk) <$> tabulate <$> catMaybes <$> mapM recogniseItem items +recognisePage (TagBranch "page" _ items) = tabulate <$> catMaybes <$> mapM recogniseItem items hunk ./CaseList.hs 118 -tabulatedJunk [(True, hasPrefix "Freedom of Information and Environmental Information Regulations caseload snapshot in " -> Just _)] = True -tabulatedJunk [(False, hasPrefix "This report provides a snapshot of the Freedom of Information and Environmental Information Regulations cases " -> Just _)] = True -tabulatedJunk _ = False - - hunk ./CaseList.hs 132 - table <- join $ recogniseDoc <$> tagTree <$> filter (not . isJunk) <$> parseTags <$> readFile (stem ++ ".xml") - writeFile (stem ++ ".html") ("\n" ++ concat [" \n" ++ concat [" \n" | (title, text) <- row] ++ " \n" | row <- table] ++ "
" ++ (if title then "" ++ text ++ "" else text) ++ "
\n") - writeFile (stem ++ ".csv") (printCSV (map (map snd) table)) + doc <- join $ recogniseDoc <$> tagTree <$> filter (not . isJunk) <$> parseTags <$> readFile (stem ++ ".xml") + writeFile (stem ++ ".html") (docToHTML doc) + writeFile (stem ++ ".csv") (printCSV (docToCSV doc))