You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 

88 lines
2.5 KiB

/*
* vertretungsplan.io custom crawler
* Copyright (C) 2019 Jonas Lochmann
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, version 3 of the
* License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
import { createHash } from 'crypto'
import * as LRU from 'lru-cache'
import * as request from 'request'
/**
 * Summary of one downloaded URL: a digest of its body plus the response
 * headers needed to recognise changes and to revalidate it later.
 */
interface UrlInfo {
  /** Hex-encoded SHA-512 digest of the response body. */
  sha512: string
  /** Number of body bytes that were received. */
  size: number
  /** `Content-Type` response header; `'application/unknown'` when the header is absent. */
  mimeType: string
  /** `ETag` response header, sent back as `If-None-Match` on the next request for the URL. */
  etag?: string
  /** `Last-Modified` response header as milliseconds since the Unix epoch. */
  lastModified?: number
}
// Most recent UrlInfo per URL (the URL string is the key); the LRU keeps at most 1000 entries.
const cache = new LRU<string, UrlInfo>({ max: 1000 })
/**
 * Fetches the current info for `url` and remembers it in the module cache.
 *
 * The entry cached for the same URL (if any) is handed to the fetcher so
 * that it can revalidate with the stored ETag instead of starting from scratch.
 *
 * @param url address to inspect
 * @returns the info that was stored in the cache for `url`
 */
export async function getUrlInfo (url: string) {
  const freshInfo = await getUrlInfoInternal(url, cache.get(url))

  cache.set(url, freshInfo)

  return freshInfo
}
/**
 * Downloads `url` and summarises the response as a `UrlInfo`.
 *
 * When `lastResponse` carries an ETag it is sent as `If-None-Match`; a
 * `304 Not Modified` answer then resolves with `lastResponse` unchanged.
 *
 * @param url address to fetch
 * @param lastResponse result of an earlier call for the same URL, if any
 * @returns a promise for the digest, size and relevant headers of the response
 * @throws the promise rejects when the request fails, when the status is
 *   neither 200 nor 304, or when a 304 arrives although there is no
 *   earlier result that could be reused
 */
function getUrlInfoInternal (url: string, lastResponse?: UrlInfo): Promise<UrlInfo> {
  const hash = createHash('sha512')
  let receivedResponse: request.Response | undefined = undefined
  let byteCounter = 0

  return new Promise<UrlInfo>((resolve, reject) => {
    request({
      url,
      headers: lastResponse && lastResponse.etag ? {
        'If-None-Match': lastResponse.etag
      } : {}
    }).on('response', (response: request.Response) => {
      receivedResponse = response

      if (response.statusCode === 304) {
        // not modified - only meaningful if there is an earlier result to reuse;
        // never settle the promise with undefined
        if (lastResponse) {
          resolve(lastResponse)
        } else {
          reject(new Error('server reported 304 without a previous response'))
        }

        return
      }

      if (response.statusCode !== 200) {
        reject(new Error('server reported ' + response.statusCode))
      }
    }).on('data', (data: Buffer) => {
      byteCounter += data.length
      // feed the hash directly instead of piping into it, so that digest()
      // below is the only place where the hash gets finalized
      hash.update(data)
    }).on('end', () => {
      // if the promise was already settled by the response handler,
      // the resolve/reject calls below have no effect
      if (!receivedResponse) {
        reject(new Error('illegal state'))

        return
      }

      const etag = receivedResponse.headers['etag']
      const lastModified = new Date(receivedResponse.headers['last-modified'] || 'invalid').getTime()

      resolve({
        sha512: hash.digest('hex'),
        // a missing or unparseable header yields NaN; report it as "unknown" instead
        lastModified: isNaN(lastModified) ? undefined : lastModified,
        mimeType: receivedResponse.headers['content-type'] || 'application/unknown',
        etag: typeof etag === 'string' ? etag : undefined,
        size: byteCounter
      })
    }).on('error', (ex) => reject(ex))
  })
}