You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

134 lines
3.8KB

  1. /*
  2. * vertretungsplan.io custom crawler
  3. * Copyright (C) 2019 Jonas Lochmann
  4. *
  5. * This program is free software: you can redistribute it and/or modify
  6. * it under the terms of the GNU Affero General Public License as
  7. * published by the Free Software Foundation, version 3 of the
  8. * License.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU Affero General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU Affero General Public License
  16. * along with this program. If not, see <https://www.gnu.org/licenses/>.
  17. */
  18. import * as cheerio from 'cheerio'
  19. import { minify } from 'html-minifier'
  20. import { max, sum } from 'lodash'
  21. import * as sanitizeHtml from 'sanitize-html'
  /**
   * Sanitizes an HTML snippet while keeping a small set of formatting tags and
   * color styling, then normalizes the tables it contains.
   *
   * Processing steps:
   * 1. `sanitize-html` strips everything except the allowed tags, attributes
   *    and color styles; links are forced to open in a new tab and tables get
   *    a visible border.
   * 2. A row that only consists of a single `td[rowspan=2]` is merged into the
   *    next row of the same table that does not have exactly one cell.
   * 3. All `rowspan` attributes are removed from `td` elements.
   * 4. Every row is padded with empty cells until all rows of a table have the
   *    same width (taking `colspan` into account).
   *
   * @param snippet the HTML to sanitize
   * @returns the sanitized and normalized HTML as serialized by cheerio
   */
  export const cleanHtmlAllowStyling = (snippet: string) => {
    // only plain hex values and rgb(r, g, b) are accepted as colors
    const colorPatterns = [/^\#(0x)?[0-9a-f]+$/i, /^rgb\(\s*(\d{1,3})\s*,\s*(\d{1,3})\s*,\s*(\d{1,3})\s*\)$/]

    const base = sanitizeHtml(snippet, {
      allowedTags: ['b', 'i', 'a', 'br', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'table', 'tbody', 'tr', 'td', 'th', 'strike', 'span', 'strong'],
      allowedAttributes: {
        // target and rel are overwritten by the transformTags entry below;
        // they have to be listed here because attributes are filtered after
        // the transformation
        a: ['href', 'target', 'rel'],
        table: ['border'],
        td: ['colspan', 'style', 'rowspan'],
        span: ['style']
      },
      allowProtocolRelative: false,
      transformTags: {
        a: (tagName: string, attribs: Record<string, string>) => ({
          tagName,
          attribs: {
            ...attribs,
            target: '_blank',
            // prevent the opened page from accessing window.opener
            rel: 'noopener noreferrer'
          }
        }),
        table: (tagName: string, attribs: Record<string, string>) => ({
          tagName,
          attribs: {
            ...attribs,
            border: '1'
          }
        })
      },
      allowedStyles: {
        '*': {
          color: colorPatterns,
          'background-color': colorPatterns
        }
      }
    })

    const $ = cheerio.load(base)

    // returns the rows which belong directly to the table (not to a nested table)
    const ownRows = (table: Cheerio): Array<Cheerio> => (
      table.find('tr')
        .filter((_, row) => $(row).closest('table').is(table))
        .toArray()
        .map((row) => $(row))
    )

    // returns the number of columns occupied by the cells (td and th) of the row
    const rowWidth = (tr: Cheerio): number => sum(
      tr.children('td, th').toArray().map((cell) => {
        const colSpan = parseInt($(cell).attr('colspan') || '1', 10)

        // missing or invalid colspan values count as a single column
        return Number.isInteger(colSpan) && colSpan >= 1 ? colSpan : 1
      })
    )

    // merge the 2 columns if they exist
    $('table').each((_, element) => {
      // row which only contains a td[rowspan=2] and still waits for a merge target
      let pendingRow: Cheerio | null = null

      for (const tr of ownRows($(element))) {
        const cells = tr.children()

        if (cells.length === 1) {
          if (cells.first().filter('td[rowspan=2]').length === 1) {
            pendingRow = tr
          }
        } else if (pendingRow !== null) {
          tr.prepend(pendingRow.children().first())

          // the row is only removed after its cell was moved so that the
          // content is kept if there is no row to merge it into
          pendingRow.remove()
          pendingRow = null
        }
      }
    })

    // delete all rowspans
    $('td').removeAttr('rowspan')

    // fill up all table rows
    $('table').each((_, element) => {
      const rows = ownRows($(element))
      const widths = rows.map((tr) => rowWidth(tr))
      const tableWidth = max(widths) || 0

      rows.forEach((tr, index) => {
        const missingWidth = tableWidth - widths[index]

        for (let i = 0; i < missingWidth; i++) {
          tr.append('<td />')
        }
      })
    })

    // NOTE(review): depending on the cheerio version, load() may wrap the
    // snippet in html/head/body elements which would then be part of this
    // output - confirm against the installed version
    return $.html()
  }
  /**
   * Shrinks the given HTML using html-minifier with whitespace collapsing and
   * comment removal enabled.
   */
  const minifyHtml = (html: string) => {
    const compact = minify(html, {
      removeComments: true,
      collapseWhitespace: true
    })

    return compact
  }
  /**
   * Builds a complete HTML document from an HTML extract: the extract is
   * cleaned with cleanHtmlAllowStyling, minified and finally wrapped into a
   * document skeleton.
   *
   * @param htmlExtract the HTML extract to process
   * @returns a promise which resolves to the resulting HTML document
   */
  export const createHtmlExtractAllowStyling = async (htmlExtract: string) => {
    const sanitized = cleanHtmlAllowStyling(htmlExtract)
    const compact = minifyHtml(sanitized)

    return wrapHtmlExtractUnsafe(compact)
  }
  /**
   * Embeds the given markup in the body of a minimal HTML document.
   *
   * The markup is inserted as it is, without any escaping or sanitizing, so
   * the caller has to pass HTML which is safe to embed.
   *
   * @param unsafeHtml the markup to place inside the body element
   * @returns a promise which resolves to the complete document
   */
  const wrapHtmlExtractUnsafe = async (unsafeHtml: string) => {
    const documentStart = '<!DOCTYPE html><html><head><meta http-equiv="content-type" content="text/html; charset=utf8" /></head><body>'
    const documentEnd = '</body></html>'

    return [documentStart, unsafeHtml, documentEnd].join('')
  }