/*
 * vertretungsplan.io custom crawler
 * Copyright (C) 2019 Jonas Lochmann
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, version 3 of the
 * License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */
|
|
|
|
import * as cheerio from 'cheerio'
|
|
import { minify } from 'html-minifier'
|
|
import { max, sum } from 'lodash'
|
|
import * as sanitizeHtml from 'sanitize-html'
|
|
|
|
/**
 * Largest `colspan` value honoured when measuring rows. The HTML Standard limits
 * `colspan` to 1000; clamping here keeps a hostile attribute value from driving
 * the row-padding loop below into appending an unbounded number of cells.
 */
const MAX_COLSPAN = 1000

/**
 * Sanitizes an HTML snippet while keeping a small set of formatting tags and
 * color styling, then normalizes the tables it contains.
 *
 * Steps, in order:
 * 1. `sanitizeHtml` strips everything except the allowed tags, attributes and
 *    `color` / `background-color` styles. Every link gets `target="_blank"` and
 *    every table gets `border="1"`.
 * 2. A row consisting of a single `td[rowspan=2]` is merged into the next row
 *    of the same table that does not consist of exactly one cell: the cell is
 *    prepended to that row and the now empty row is removed. If no such row
 *    follows, the single-cell row is left in place so its content is not lost.
 * 3. The `rowspan` attribute is removed from all `td` elements.
 * 4. Every row is padded with empty `<td />` cells up to the width of the
 *    widest row of its table. Widths count direct `td` and `th` children and
 *    respect `colspan` (clamped to `MAX_COLSPAN`).
 *
 * Rows of nested tables are handled as part of the nested table only, never as
 * part of an enclosing table.
 *
 * @param snippet untrusted HTML
 * @returns the serialized result of `$.html()` for the processed document
 */
export const cleanHtmlAllowStyling = (snippet: string): string => {
  // the same patterns are accepted for foreground and background colors
  const colorPatterns = [
    /^\#(0x)?[0-9a-f]+$/i,
    /^rgb\(\s*(\d{1,3})\s*,\s*(\d{1,3})\s*,\s*(\d{1,3})\s*\)$/
  ]

  const base = sanitizeHtml(snippet, {
    allowedTags: ['b', 'i', 'a', 'br', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'table', 'tbody', 'tr', 'td', 'th', 'strike', 'span', 'strong'],
    allowedAttributes: {
      a: ['href', /* will be modified below */ 'target'],
      table: ['border'],
      td: ['colspan', 'style', 'rowspan'],
      span: ['style']
    },
    allowProtocolRelative: false,
    transformTags: {
      a: (tagName: string, attribs: object) => ({
        tagName,
        attribs: {
          ...attribs,
          target: '_blank'
        }
      }),
      table: (tagName: string, attribs: object) => ({
        tagName,
        attribs: {
          ...attribs,
          border: '1'
        }
      })
    },
    allowedStyles: {
      '*': {
        color: colorPatterns,
        'background-color': colorPatterns
      }
    }
  })

  const $ = cheerio.load(base)

  // Rows whose nearest enclosing table is the given table (skips rows of nested tables).
  const ownRows = (table: Cheerio) => (
    table.find('tr').filter((_, row) => $(row).closest('table').get(0) === table.get(0))
  )

  // Number of columns a row occupies: its direct td/th children, weighted by colspan.
  const rowWidth = (tr: Cheerio): number => sum(
    tr.children('td, th').map((_, cell) => {
      const colSpan = parseInt($(cell).attr('colspan') || '1', 10)

      // missing, non-numeric or non-positive values count as a single column
      return Number.isInteger(colSpan) && colSpan >= 1 ? Math.min(colSpan, MAX_COLSPAN) : 1
    }).get()
  )

  // merge the 2 columns if they exist
  $('table').each((_, tableElement) => {
    // single-cell row waiting to be merged into a following row of this table
    let pendingRow: Cheerio | null = null

    ownRows($(tableElement)).each((_, rowElement) => {
      const tr = $(rowElement)
      const cells = tr.children()

      if (cells.length === 1) {
        if (cells.first().filter('td[rowspan=2]').length === 1) {
          // Only remember the row here; it is removed once its cell was
          // actually moved, so the content survives if no target row follows.
          pendingRow = tr
        }
      } else if (pendingRow !== null) {
        tr.prepend(pendingRow.children().first())
        pendingRow.remove()
        pendingRow = null
      }
    })
  })

  // delete all rowspans
  $('td').removeAttr('rowspan')

  // fill up all table rows
  $('table').each((_, tableElement) => {
    const rows = ownRows($(tableElement))
    const widths: Array<number> = rows.map((_, row) => rowWidth($(row))).get()
    const tableWidth = max(widths) || 0

    rows.each((_, row) => {
      const tr = $(row)
      const missingWidth = tableWidth - rowWidth(tr)

      for (let i = 0; i < missingWidth; i++) {
        tr.append('<td />')
      }
    })
  })

  return $.html()
}
|
|
|
|
/**
 * Runs `minify` on the given HTML with whitespace collapsing and comment
 * removal enabled and returns its result.
 */
const minifyHtml = (html: string) => {
  const minifierOptions = {
    collapseWhitespace: true,
    removeComments: true
  }

  return minify(html, minifierOptions)
}
|
|
|
|
/**
 * Builds a complete HTML document from an HTML extract: the extract is cleaned
 * with `cleanHtmlAllowStyling`, minified with `minifyHtml` and finally wrapped
 * by `wrapHtmlExtractUnsafe`.
 *
 * @param htmlExtract the HTML extract to process
 * @returns a promise for the wrapped document
 */
export const createHtmlExtractAllowStyling = async (htmlExtract: string) => {
  const cleaned = cleanHtmlAllowStyling(htmlExtract)
  const minified = minifyHtml(cleaned)

  return wrapHtmlExtractUnsafe(minified)
}
|
|
|
|
/**
 * Wraps an HTML fragment into a minimal HTML document (doctype, a head with a
 * content-type meta tag, and a body containing the fragment).
 *
 * The fragment is inserted verbatim without any escaping or sanitizing, so the
 * caller has to pass HTML that is already safe.
 *
 * @param unsafeHtml the fragment to place inside the body
 * @returns a promise for the complete document
 */
const wrapHtmlExtractUnsafe = async (unsafeHtml: string) => {
  const documentStart = '<!DOCTYPE html><html><head><meta http-equiv="content-type" content="text/html; charset=utf8" /></head><body>'
  const documentEnd = '</body></html>'

  return `${documentStart}${unsafeHtml}${documentEnd}`
}
|