You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

82 lines
2.2 KiB

  1. /*
  2. * vertretungsplan.io custom crawler
  3. * Copyright (C) 2019 Jonas Lochmann
  4. *
  5. * This program is free software: you can redistribute it and/or modify
  6. * it under the terms of the GNU Affero General Public License as
  7. * published by the Free Software Foundation, version 3 of the
  8. * License.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU Affero General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU Affero General Public License
  16. * along with this program. If not, see <https://www.gnu.org/licenses/>.
  17. */
  18. import * as cheerio from 'cheerio'
  19. import { createHash } from 'crypto'
  20. import * as request from 'request-promise-native'
  21. import { config as baseConfig } from '../config'
  22. import { createHtmlExtractAllowStyling } from './html-extract'
  23. const config = baseConfig.lbzBlindeHalle
  24. const crawlCategory = async ({ planUrl, password }: {
  25. planUrl: string
  26. password: string
  27. }) => {
  28. const receivedResponse = await request({
  29. url: planUrl,
  30. resolveWithFullResponse: true,
  31. method: 'POST',
  32. form: {
  33. pwd: password
  34. }
  35. })
  36. const planPageContent = receivedResponse.body
  37. const lastModified = new Date(receivedResponse.headers['last-modified'] || 'invalid').getTime()
  38. const $ = cheerio.load(planPageContent)
  39. const content = $('div[id=content]')
  40. const title = $(content).find('h1').filter('.atitle').first()
  41. const contentTitle = title.text()
  42. title.remove()
  43. const contentHtml = content.html()
  44. if (!contentHtml) {
  45. throw new Error('no content found')
  46. }
  47. const html = await createHtmlExtractAllowStyling(contentHtml)
  48. const htmlBuffer = Buffer.from(html, 'utf8')
  49. return {
  50. content: htmlBuffer,
  51. title: contentTitle,
  52. lastModified,
  53. size: htmlBuffer.length,
  54. sha512: createHash('sha512').update(htmlBuffer).digest('hex')
  55. }
  56. }
  57. export async function crawlCategories () {
  58. const student = await crawlCategory({
  59. planUrl: config.studentPlanUrl,
  60. password: config.studentPassword
  61. })
  62. const teacher = await crawlCategory({
  63. planUrl: config.teacherPlanUrl,
  64. password: config.teacherPassword
  65. })
  66. return { student, teacher }
  67. }