You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

130 lines
3.2 KiB

  1. /*
  2. * vertretungsplan.io custom crawler
  3. * Copyright (C) 2019 Jonas Lochmann
  4. *
  5. * This program is free software: you can redistribute it and/or modify
  6. * it under the terms of the GNU Affero General Public License as
  7. * published by the Free Software Foundation, version 3 of the
  8. * License.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU Affero General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU Affero General Public License
  16. * along with this program. If not, see <https://www.gnu.org/licenses/>.
  17. */
  18. import * as cheerio from 'cheerio'
  19. import * as request from 'request-promise-native'
  20. import { resolve } from 'url'
  21. import { generateId } from '../generate-id'
  22. import { getUrlInfo } from '../get-url-info'
  /**
   * One downloadable document found by the crawler.
   */
  interface File {
    /** Identifier of the entry; `crawl` derives it from the resolved file URL. */
    id: string
    /** Display title of the entry. */
    title: string
    /** `crawl` uses 'plan' for links in the first downloads block and 'download' for all others. */
    type: 'plan' | 'download'
    /** MIME type as reported by `getUrlInfo`. */
    mimeType: string
    /** Optional modification time as reported by `getUrlInfo` (unit not visible here - TODO confirm). */
    lastModified?: number
    /** NOTE(review): presumably controls whether users get notified about this entry - confirm with the consumer. */
    notify: boolean
    /** Exactly one source descriptor (a one-element tuple, not an open-ended array). */
    file: [{
      /** Absolute URL of the document. */
      url: string
      /** SHA-512 value as reported by `getUrlInfo` (encoding not visible here). */
      sha512: string
      /** Size as reported by `getUrlInfo` (presumably bytes - TODO confirm). */
      size: number
    }]
  }
  /**
   * One text notice found by the crawler.
   */
  interface Message {
    /** Identifier of the notice; `crawl` derives it from the title. */
    id: string
    /** NOTE(review): presumably controls whether users get notified about this notice - confirm with the consumer. */
    notify: boolean
    /** Headline of the notice. */
    title: string
    /** Body text of the notice. */
    content: string
  }
  /**
   * Everything a single crawler run produces.
   */
  interface CrawlerResult {
    /** All text notices found on the page. */
    message: Message[]
    /** All downloadable documents found on the page. */
    file: File[]
  }
  /**
   * Turns the visible label of a download link into a title by trimming
   * surrounding whitespace and cutting off everything from the last dot on.
   *
   * The cut only happens when the last dot is not the first character, so a
   * label such as ".htaccess" is kept instead of collapsing to an empty title.
   *
   * @param label raw link text
   * @returns the cleaned-up title
   */
  function stripFileExtension (label: string): string {
    const trimmed = label.trim()
    const lastDot = trimmed.lastIndexOf('.')

    // lastDot === -1: no dot at all; lastDot === 0: leading dot only
    return lastDot > 0 ? trimmed.substring(0, lastDot) : trimmed
  }

  /**
   * Downloads the substitution plan page and extracts its notices and files.
   *
   * - Every `.ce_text` element becomes a message: its `h1` text is the title,
   *   the remaining text is the content.
   * - Every link inside a list item of a `.ce_downloads` element becomes a file.
   *   Links in the first `.ce_downloads` element are typed 'plan' and have
   *   `notify` set; links in all later ones are typed 'download' without it.
   * - Metadata (MIME type, modification time, hash, size) of every file is
   *   fetched with `getUrlInfo`, for all files in parallel.
   *
   * @returns the collected messages and files
   * @throws rejects when the page request or any `getUrlInfo` call rejects
   */
  export async function crawl (): Promise<CrawlerResult> {
    const baseUrl = 'https://www.domgymnasium-verden.de/vertretungsplan.html'
    const html = await request(baseUrl)
    const $ = cheerio.load(html)

    const message: Array<Message> = []

    $('.ce_text').each((_, textBlock) => {
      const title = $(textBlock).find('h1').text().trim()

      // work on a clone so that removing the headline does not modify the document
      const content = $(textBlock)
        .clone()
        .find('h1') // select the headline
        .remove() // drop it from the clone
        .end() // back to the cloned block
        .text()
        .trim()

      // NOTE(review): the id depends on the title only, so two blocks with the
      // same (or no) headline get the same id - confirm that this is acceptable
      message.push({
        id: generateId(title),
        title,
        content,
        notify: true
      })
    })

    const fileBase: Array<{
      id: string
      title: string
      url: string
      type: 'plan' | 'download'
      notify: boolean
    }> = []

    $('.ce_downloads').each((blockIndex, downloadBlock) => {
      // only the first downloads block is treated as the plan itself
      const isPlanBlock = blockIndex === 0

      $(downloadBlock).find('li').find('a').each((_, link) => {
        const href = $(link).attr('href')

        if (!href) {
          // an anchor without a target can not be downloaded; passing
          // undefined to resolve() would throw and abort the whole crawl
          return
        }

        const itemUrl = resolve(baseUrl, href)

        // text of the link itself, without the text of nested elements
        const label = $(link)
          .clone()
          .children()
          .remove()
          .end()
          .text()

        fileBase.push({
          id: generateId(itemUrl),
          title: stripFileExtension(label),
          url: itemUrl,
          type: isPlanBlock ? 'plan' : 'download',
          notify: isPlanBlock
        })
      })
    })

    const file = await Promise.all(fileBase.map(async (base): Promise<File> => {
      const info = await getUrlInfo(base.url)

      return {
        id: base.id,
        notify: base.notify,
        type: base.type,
        lastModified: info.lastModified,
        mimeType: info.mimeType,
        title: base.title,
        file: [{
          url: base.url,
          sha512: info.sha512,
          size: info.size
        }]
      }
    }))

    return {
      message,
      file
    }
  }