You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 

129 lines
3.2 KiB

/*
* vertretungsplan.io custom crawler
* Copyright (C) 2019 Jonas Lochmann
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, version 3 of the
* License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
import * as cheerio from 'cheerio'
import * as request from 'request-promise-native'
import { resolve } from 'url'
import { generateId } from '../generate-id'
import { getUrlInfo } from '../get-url-info'
/**
 * One document entry reported by the crawler.
 */
interface File {
  /** Identifier of the entry. */
  id: string
  /** Display title of the entry. */
  title: string
  /** Category of the entry: either a plan or a generic download. */
  type: 'plan' | 'download'
  /** NOTE(review): presumably controls whether changes trigger a notification - confirm with the consumer. */
  notify: boolean
  /** MIME type of the document. */
  mimeType: string
  /** Optional modification time; NOTE(review): unit/epoch is not visible here - confirm. */
  lastModified?: number
  /** Exactly one download source (a one-element tuple) with its URL, SHA-512 hash and size. */
  file: [{ url: string, sha512: string, size: number }]
}
/**
 * One text message reported by the crawler.
 */
interface Message {
  /** Identifier of the message. */
  id: string
  /** NOTE(review): presumably controls whether the message triggers a notification - confirm with the consumer. */
  notify: boolean
  /** Headline of the message. */
  title: string
  /** Text body of the message. */
  content: string
}
/**
 * Everything a single crawl run produces.
 */
interface CrawlerResult {
  /** All document entries found during the run. */
  file: File[]
  /** All text messages found during the run. */
  message: Message[]
}
/**
 * Crawls the substitution plan page of the Domgymnasium Verden.
 *
 * Every `.ce_text` element becomes a message: its `h1` text is the title and
 * the remaining text is the content. Every link inside a list item of a
 * `.ce_downloads` element becomes a file; links of the first `.ce_downloads`
 * element are reported as plans with notifications enabled, links of all
 * later ones as plain downloads without notifications.
 *
 * Links without a `href` attribute are skipped because they can not be
 * resolved to a URL.
 *
 * @returns the messages and files found on the page
 * @throws rejects if fetching the page or any `getUrlInfo` call rejects
 */
export async function crawl (): Promise<CrawlerResult> {
  const baseUrl = 'https://www.domgymnasium-verden.de/vertretungsplan.html'
  const pageHtml = await request(baseUrl)
  const $ = cheerio.load(pageHtml)

  const message: Array<Message> = []

  $('.ce_text').each((_, textElement) => {
    const title = $(textElement).find('h1').text().trim()
    // work on a clone so that removing the headline does not modify the document
    const content = $(textElement).clone()
      .find('h1')   // select the headline
      .remove()     // remove the headline
      .end()        // go back to the cloned element
      .text()
      .trim()

    message.push({
      title,
      content,
      id: generateId(title),
      notify: true
    })
  })

  const fileBase: Array<{
    id: string
    title: string
    url: string
    type: 'plan' | 'download'
    notify: boolean
  }> = []

  $('.ce_downloads').each((index, downloadList) => {
    // only the first download list contains the plans
    const isPlan = index === 0

    $(downloadList).find('li').find('a').each((_, link) => {
      const href = $(link).attr('href')

      if (typeof href !== 'string') {
        // resolve() throws for a missing target, which would abort the whole crawl
        return
      }

      const itemUrl = resolve(baseUrl, href)

      // only the own text of the link is the title, nested elements are dropped
      let title = $(link)
        .clone()
        .children()
        .remove()
        .end()
        .text()

      // cut off the last dot and everything after it (the file extension)
      const lastDot = title.lastIndexOf('.')

      if (lastDot !== -1) {
        title = title.substring(0, lastDot)
      }

      fileBase.push({
        id: generateId(itemUrl),
        title,
        url: itemUrl,
        type: isPlan ? 'plan' : 'download',
        notify: isPlan
      })
    })
  })

  // query the details of all files in parallel
  const file = await Promise.all(fileBase.map(async (item): Promise<File> => {
    const info = await getUrlInfo(item.url)

    return {
      id: item.id,
      notify: item.notify,
      type: item.type,
      lastModified: info.lastModified,
      mimeType: info.mimeType,
      title: item.title,
      file: [{
        url: item.url,
        sha512: info.sha512,
        size: info.size
      }]
    }
  }))

  return {
    message,
    file
  }
}