You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 

81 lines
2.2 KiB

/*
* vertretungsplan.io custom crawler
* Copyright (C) 2019 Jonas Lochmann
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, version 3 of the
* License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
import * as cheerio from 'cheerio'
import { createHash } from 'crypto'
import * as request from 'request-promise-native'
import { config as baseConfig } from '../config'
import { createHtmlExtractAllowStyling } from './html-extract'
// Section of the shared configuration used by this crawler; this file reads
// studentPlanUrl, studentPassword, teacherPlanUrl and teacherPassword from it.
const config = baseConfig.lbzBlindeHalle
/**
 * Fetches a single plan page and extracts the markup of its content area.
 *
 * The page is requested with a POST whose form body carries the password in
 * the `pwd` field. From the response, the `div` with id `content` is taken,
 * its first `h1.atitle` heading is read as the title and removed, and the
 * remaining markup is passed through `createHtmlExtractAllowStyling`
 * (presumably a sanitizing step - see './html-extract').
 *
 * @param planUrl URL of the plan page
 * @param password value sent as the `pwd` form field
 * @returns the processed HTML as a UTF-8 buffer together with its title,
 *   byte size, SHA-512 hex digest and the `Last-Modified` time in
 *   milliseconds (NaN if the header is missing or can not be parsed)
 * @throws Error('no content found') if the content area yields no HTML
 */
const crawlCategory = async ({ planUrl, password }: {
  planUrl: string
  password: string
}) => {
  const response = await request({
    method: 'POST',
    url: planUrl,
    form: { pwd: password },
    // the full response is required to get access to the headers
    resolveWithFullResponse: true
  })

  // a missing header is replaced by an unparsable string, which results in NaN
  const lastModifiedHeader = response.headers['last-modified'] || 'invalid'
  const lastModified = new Date(lastModifiedHeader).getTime()

  const $ = cheerio.load(response.body)
  const contentElement = $('div[id=content]')
  const titleElement = contentElement.find('h1').filter('.atitle').first()

  // read the title text before detaching the heading from the content
  const contentTitle = titleElement.text()
  titleElement.remove()

  const rawHtml = contentElement.html()

  if (!rawHtml) {
    throw new Error('no content found')
  }

  const extractedHtml = await createHtmlExtractAllowStyling(rawHtml)
  const htmlBuffer = Buffer.from(extractedHtml, 'utf8')
  const sha512 = createHash('sha512').update(htmlBuffer).digest('hex')

  return {
    content: htmlBuffer,
    title: contentTitle,
    lastModified,
    size: htmlBuffer.length,
    sha512
  }
}
/**
 * Crawls the student plan and the teacher plan using the URLs and passwords
 * from the configuration.
 *
 * The two crawls do not depend on each other, so they are started together
 * and awaited with `Promise.all` instead of running one after the other.
 *
 * @returns the crawl results under the keys `student` and `teacher`
 * @throws rejects with the first error if any of the two crawls fails
 */
export async function crawlCategories () {
  const [student, teacher] = await Promise.all([
    crawlCategory({
      planUrl: config.studentPlanUrl,
      password: config.studentPassword
    }),
    crawlCategory({
      planUrl: config.teacherPlanUrl,
      password: config.teacherPassword
    })
  ])

  return { student, teacher }
}