/*
|
|
* vertretungsplan.io custom crawler
|
|
* Copyright (C) 2019 Jonas Lochmann
|
|
*
|
|
* This program is free software: you can redistribute it and/or modify
|
|
* it under the terms of the GNU Affero General Public License as
|
|
* published by the Free Software Foundation, version 3 of the
|
|
* License.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU Affero General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU Affero General Public License
|
|
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
import * as cheerio from 'cheerio'
|
|
import * as request from 'request-promise-native'
|
|
import { resolve } from 'url'
|
|
import { generateId } from '../generate-id'
|
|
import { getUrlInfo } from '../get-url-info'
|
|
|
|
/**
 * One downloadable document reported by the crawler.
 */
interface File {
  /** Identifier of the entry; `crawl` derives it from the document URL. */
  id: string
  /** Category of the document. */
  type: 'plan' | 'download'
  /** Human readable label of the document. */
  title: string
  /** MIME type as reported by the URL lookup. */
  mimeType: string
  /** Modification stamp from the URL lookup, if any (unit not visible here - TODO confirm). */
  lastModified?: number
  /** Whether a change of this entry should trigger a notification. */
  notify: boolean
  /** Exactly one location descriptor: where the document lives, its hash and its size. */
  file: [{
    url: string
    sha512: string
    size: number
  }]
}
|
|
|
|
/**
 * One text notice reported by the crawler.
 */
interface Message {
  /** Identifier of the notice; `crawl` derives it from the title. */
  id: string
  /** Whether a change of this notice should trigger a notification. */
  notify: boolean
  /** Headline of the notice. */
  title: string
  /** Body text of the notice. */
  content: string
}
|
|
|
|
/**
 * Everything a single crawler run produces.
 */
interface CrawlerResult {
  /** Documents found during the run. */
  file: File[]
  /** Text notices found during the run. */
  message: Message[]
}
|
|
|
|
/**
 * Crawls the substitution plan page of the Domgymnasium Verden.
 *
 * Every `.ce_text` element becomes one message: the text of its `h1` is the
 * title, the remaining text is the content. Every link inside a list item of
 * a `.ce_downloads` element becomes one file. Links from the first
 * `.ce_downloads` element are reported as type `plan` with notifications
 * enabled, links from all later ones as `download` without notifications.
 * Links without a `href` attribute are ignored.
 *
 * @returns the messages and files found on the page
 * @throws rejects when the page request or any `getUrlInfo` lookup rejects
 */
export async function crawl (): Promise<CrawlerResult> {
  const baseUrl = 'https://www.domgymnasium-verden.de/vertretungsplan.html'
  const pageHtml = await request(baseUrl)
  const $ = cheerio.load(pageHtml)

  const message: Array<Message> = []

  $('.ce_text').each((_, textElement) => {
    const title = $(textElement).find('h1').text().trim()
    // Work on a clone so that dropping the headline does not modify the
    // parsed document; end() returns from the removed h1 to the clone.
    const content = $(textElement)
      .clone()
      .find('h1')
      .remove()
      .end()
      .text()
      .trim()

    message.push({
      title,
      content,
      id: generateId(title),
      notify: true
    })
  })

  const fileBase: Array<{
    id: string
    title: string
    url: string
    type: 'plan' | 'download'
    notify: boolean
  }> = []

  $('.ce_downloads').each((index, downloadList) => {
    // Only the first download list is treated as the plan itself.
    const isPlan = index === 0

    $(downloadList).find('li').find('a').each((_, link) => {
      const href = $(link).attr('href')

      if (!href) {
        // An anchor without a target can not be resolved to a file;
        // url.resolve would throw on undefined and abort the whole crawl.
        return
      }

      const itemUrl = resolve(baseUrl, href)

      // Use only the text nodes directly inside the link: the children are
      // removed from a clone before reading the text.
      let title = $(link)
        .clone()
        .children()
        .remove()
        .end()
        .text()
        .trim()

      // Cut off everything from the last dot on (e.g. a file extension).
      const lastDot = title.lastIndexOf('.')

      if (lastDot !== -1) {
        title = title.substring(0, lastDot)
      }

      fileBase.push({
        id: generateId(itemUrl),
        title,
        url: itemUrl,
        type: isPlan ? 'plan' : 'download',
        notify: isPlan
      })
    })
  })

  // Look up the metadata of all files concurrently.
  const file = await Promise.all(fileBase.map(async (item): Promise<File> => {
    const info = await getUrlInfo(item.url)

    return {
      id: item.id,
      notify: item.notify,
      type: item.type,
      lastModified: info.lastModified,
      mimeType: info.mimeType,
      title: item.title,
      file: [{
        url: item.url,
        sha512: info.sha512,
        size: info.size
      }]
    }
  }))

  return {
    message,
    file
  }
}
|