You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 

133 lines
3.8 KiB

/*
* vertretungsplan.io custom crawler
* Copyright (C) 2019 Jonas Lochmann
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, version 3 of the
* License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
import * as cheerio from 'cheerio'
import { minify } from 'html-minifier'
import { max, sum } from 'lodash'
import * as sanitizeHtml from 'sanitize-html'
/**
 * Sanitizes an HTML snippet (keeping a small set of tags plus color styling)
 * and normalizes the tables inside of it.
 *
 * Processing steps:
 * 1. `sanitize-html` removes everything except the allowed tags, attributes
 *    and color styles; links get `target="_blank"` and tables get `border="1"`.
 * 2. A row which only consists of one `td[rowspan=2]` is merged into the next
 *    row of the same table which does not have exactly one cell: the cell is
 *    moved to the start of that row and the now empty row is removed.
 *    If there is no such row, the original row is kept so that no content
 *    gets lost.
 * 3. The `rowspan` attribute is removed from all `td` elements.
 * 4. Every row is filled up with empty `td` elements until all rows of a table
 *    have the same width (`colspan` is respected, `th` cells count as well).
 *
 * Rows and cells of nested tables are only handled as part of the nested
 * table and do not influence the surrounding table.
 *
 * NOTE(review): depending on the cheerio version, `$.html()` possibly emits
 * `html`/`head`/`body` wrapper elements; verify this in combination with the
 * wrapping done by the callers.
 *
 * @param snippet the HTML which should be cleaned
 * @returns the serialized HTML after sanitizing and normalizing the tables
 */
export const cleanHtmlAllowStyling = (snippet: string) => {
  // the same value formats are accepted for the text and the background color
  const colorPatterns = [
    /^\#(0x)?[0-9a-f]+$/i,
    /^rgb\(\s*(\d{1,3})\s*,\s*(\d{1,3})\s*,\s*(\d{1,3})\s*\)$/
  ]

  const base = sanitizeHtml(snippet, {
    allowedTags: ['b', 'i', 'a', 'br', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'table', 'tbody', 'tr', 'td', 'th', 'strike', 'span', 'strong'],
    allowedAttributes: {
      a: ['href', /* will be modified below */ 'target'],
      table: ['border'],
      td: ['colspan', 'style', 'rowspan'],
      span: ['style']
    },
    allowProtocolRelative: false,
    transformTags: {
      // open all links in a new tab/ window
      a: (tagName: string, attribs: object) => ({
        tagName,
        attribs: {
          ...attribs,
          target: '_blank'
        }
      }),
      // always show the borders of tables
      table: (tagName: string, attribs: object) => ({
        tagName,
        attribs: {
          ...attribs,
          border: '1'
        }
      })
    },
    allowedStyles: {
      '*': {
        color: colorPatterns,
        'background-color': colorPatterns
      }
    }
  })

  const $ = cheerio.load(base)

  // the rows of the table itself, excluding rows of tables nested in its cells
  const ownRows = (table: Cheerio) => table.find('tr').filter(
    (_, row) => $(row).closest('table').is(table)
  )

  // number of columns used by one cell; invalid colspan values count as 1
  const cellWidth = (cell: Cheerio) => {
    const colSpan = parseInt(cell.attr('colspan') || '1', 10)

    return Number.isInteger(colSpan) && colSpan >= 1 ? colSpan : 1
  }

  // number of columns used by the direct cells (td and th) of one row
  const rowWidth = (tr: Cheerio) => sum(
    tr.children('td, th').toArray().map((cell) => cellWidth($(cell)))
  )

  // merge the 2 columns if they exist
  $('table').each((_, element) => {
    // row with a single td[rowspan=2] which waits for a row to be merged into;
    // it is only removed when the merge really happens
    let pendingRow: Cheerio | null = null

    ownRows($(element)).each((_, rowElement) => {
      const tr = $(rowElement)
      const cells = tr.children()

      if (cells.length === 1) {
        if (cells.first().is('td[rowspan=2]')) {
          pendingRow = tr
        }
      } else if (pendingRow !== null) {
        tr.prepend(pendingRow.children().first())
        pendingRow.remove()
        pendingRow = null
      }
    })
  })

  // delete all rowspans
  $('td').removeAttr('rowspan')

  // fill up all table rows
  $('table').each((_, element) => {
    const rows = ownRows($(element)).toArray().map((row) => $(row))
    const tableWidth = max(rows.map((tr) => rowWidth(tr))) || 0

    rows.forEach((tr) => {
      const missingWidth = tableWidth - rowWidth(tr)

      for (let i = 0; i < missingWidth; i++) {
        tr.append('<td />')
      }
    })
  })

  return $.html()
}
/**
 * Shrinks the provided HTML using html-minifier with whitespace collapsing
 * and comment removal enabled.
 *
 * @param html the HTML which should be minified
 * @returns the result of the `minify` call
 */
const minifyHtml = (html: string) => {
  const minifierOptions = {
    collapseWhitespace: true,
    removeComments: true
  }

  return minify(html, minifierOptions)
}
/**
 * Builds a complete HTML document from an HTML extract: the extract is
 * cleaned with cleanHtmlAllowStyling, minified with minifyHtml and then
 * wrapped with wrapHtmlExtractUnsafe.
 *
 * @param htmlExtract the HTML extract which should be processed
 * @returns a promise for the resulting HTML document
 */
export const createHtmlExtractAllowStyling = async (htmlExtract: string) => {
  const cleaned = cleanHtmlAllowStyling(htmlExtract)
  const minified = minifyHtml(cleaned)

  return wrapHtmlExtractUnsafe(minified)
}
/**
 * Surrounds the provided HTML with a minimal HTML document (doctype, head
 * with a content-type meta element, body).
 *
 * The HTML is inserted without any escaping or sanitizing (hence "unsafe"),
 * so only already cleaned HTML should be passed.
 *
 * @param unsafeHtml the HTML which is placed into the body as it is
 * @returns a promise for the HTML document
 */
const wrapHtmlExtractUnsafe = async (unsafeHtml: string) => {
  const documentStart = '<!DOCTYPE html><html><head><meta http-equiv="content-type" content="text/html; charset=utf8" /></head><body>'
  const documentEnd = '</body></html>'

  return documentStart + unsafeHtml + documentEnd
}