Refactor code for new DB interface #8

Merged
S1SYPHOS merged 17 commits from refactor into main 5 months ago
  1. 5
      .gitignore
  2. 95
      README.md
  3. 7
      ajum/__init__.py
  4. 1084
      ajum/ajum.py
  5. 592
      ajum/cli.py
  6. 39
      ajum/helpers.py
  7. 185
      ajum/utils.py
  8. 9
      example.config.json
  9. 180347
      index.json

5
.gitignore vendored

@ -141,7 +141,10 @@ dmypy.json
cython_debug/
# Cache files
database.json
/__pycache__/
/.doit.db
/.db/
# AJuM
database.json
config.json

@ -5,6 +5,10 @@ This small library is a Python wrapper for [ajum.de](https://www.ajum.de/rezensi
We deem their work to be invaluable for kindergartens, (pre)schools, universities and other educational institutions. We are thankful for AJuM's commitment and want to give something back by spreading the word and provide an easy way to interact with their review database.
**Note:** Since we DO NOT UNDER ANY CIRCUMSTANCES want to disrupt their services, every API call is followed by a `sleep()`.
The included `index.json` file contains URL slugs for each ISBN. It was created using `--strict` mode, skipping invalid ISBNs - currently totalling 46203 (valid) ISBNs with 87939 reviews (averaging 1.90 reviews per ISBN).
## Getting started
@ -33,8 +37,11 @@ from ajum import Ajum
# Initialize object
ajum = Ajum()
# Fetch review data
data = ajum.get_review('SOME_ID')
# Fetch reviews from first page
slugs = ajum.get_slugs()
# Display their data:
print(ajum.get_reviews(slugs))
```
For more examples, have a look at `src/cli.py` and `src/ajum.py` to get you started - feedback appreciated, as always!
@ -51,20 +58,17 @@ Usage: ajum [OPTIONS] COMMAND [ARGS]...
Tools for interacting with the 'AJuM' database.
Options:
-t, --timer FLOAT Waiting time after each request.
-f, --is_from TEXT "From" header.
-u, --user-agent TEXT User agent.
-v, --verbose Enable verbose mode.
--version Show the version and exit.
--help Show this message and exit.
-c, --config PATH Path to user settings file.
-u, --ua PATH Path to "UA" strings file.
-v, --verbose Enable verbose mode.
--version Show the version and exit.
--help Show this message and exit.
Commands:
backup Backs up remote database
build Builds local database DB_FILE from INDEX_FILE
clear Removes cached results files
index Exports index of reviews per ISBN to INDEX_FILE
query Queries remote database
show Shows data of given REVIEW
export Exports review data to FILE
show Shows data for given ISBN
stats Shows statistics
update Updates local database
```
@ -83,79 +87,75 @@ Usage: ajum backup [OPTIONS]
Backs up remote database
Options:
-a, --archived Include all reviews.
-h, --html-file PATH HTML results file.
--help Show this message and exit.
-p, --parallel INTEGER Number of parallel downloads.
--help Show this message and exit.
```
### `index`
### `update`
.. reviews per ISBN:
.. local database:
```text
$ ajum index --help
Usage: ajum index [OPTIONS] [INDEX_FILE]
$ ajum update --help
Usage: ajum update [OPTIONS]
Exports index of reviews per ISBN to INDEX_FILE
Updates local database
Options:
-s, --strict Whether to skip invalid ISBNs.
-j, --jobs INTEGER Number of threads.
--help Show this message and exit.
-n, --number INTEGER Number of results pages to be scraped.
-p, --parallel INTEGER Number of parallel downloads.
--help Show this message and exit.
```
### `build`
### `export`
.. local database:
.. review data as index (or full database):
```text
$ ajum build --help
Usage: ajum build [OPTIONS] [INDEX_FILE] [DB_FILE]
$ ajum export --help
Usage: ajum export [OPTIONS] [FILE]
Builds local database DB_FILE from INDEX_FILE
Exports review data to FILE
Options:
-j, --jobs INTEGER Number of threads.
-s, --strict Whether to skip invalid ISBNs.
-f, --full Whether to export full database.
-j, --jobs INTEGER Number of jobs.
--help Show this message and exit.
```
### `show`
.. review data for given ID:
.. review data for given ISBN:
```text
$ ajum show --help
Usage: ajum show [OPTIONS] REVIEW
Usage: ajum show [OPTIONS] ISBN
Shows data of given REVIEW
Shows data for given ISBN
Options:
--help Show this message and exit.
-f, --file PATH Path to database file.
--help Show this message and exit.
```
### `query`
### `stats`
.. remote database for given search terms:
.. about (cached) reviews:
```text
$ ajum query --help
Usage: ajum query [OPTIONS]
$ ajum stats --help
Usage: ajum stats [OPTIONS]
Queries remote database
Shows statistics
Options:
-s, --search-term TEXT Search term.
-t, --title TEXT Book title.
-f, --first-name TEXT First name of author.
-l, --last-name TEXT Last name of author.
-i, --illustrator TEXT Name of illustrator.
-a, --all-reviews Include all reviews.
-w, --wolgast Include only Wolgast laureates.
--help Show this message and exit.
-f, --file FILENAME Path to index file.
--help Show this message and exit.
```
@ -170,7 +170,8 @@ Usage: ajum clear [OPTIONS]
Removes cached results files
Options:
--help Show this message and exit.
-r, --reset Whether to remove cached results pages.
--help Show this message and exit.
```

@ -0,0 +1,7 @@
from .ajum import Ajum
__all__ = [
# Main class
'Ajum',
]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -1,39 +0,0 @@
import os
import json
import hashlib
def create_path(path):
    """
    Ensures that the directory for the given target exists.

    If `path` points at a (future) JSON data file, its parent
    directory is used as the target instead.

    :param path: str Target path
    :return: None
    """
    # Strip the filename when the target is a JSON data file
    extension = os.path.splitext(path)[1]
    if extension.lower() == '.json':
        path = os.path.dirname(path)

    # Nothing to do when the directory is already present
    if os.path.exists(path):
        return

    try:
        os.makedirs(path)

    # Guard against race condition
    except OSError:
        pass
def load_json(json_file):
    """
    Loads data from JSON file

    :param json_file: str Path to JSON file
    :return: dict|list Data
    :raises: Exception Decoding error
    """
    try:
        with open(json_file, 'r') as file:
            return json.load(file)

    except json.decoder.JSONDecodeError as error:
        # Name the offending file & keep the decoding error as cause
        # (was a bare `raise Exception` followed by unreachable `return {}`)
        raise Exception('Unable to decode JSON file: "{}"'.format(json_file)) from error
def dump_json(data, json_file):
    """
    Dumps JSON data to file

    :param data: dict|list Data
    :param json_file: str Path to JSON file
    :return: None
    """
    # Make sure the target directory exists
    create_path(json_file)

    # Serialize first, then write in one go
    serialized = json.dumps(data, ensure_ascii=False, indent=4)

    with open(json_file, 'w') as handle:
        handle.write(serialized)
def dict2hash(source: dict) -> str:
    """
    Builds MD5 hash over the dictionary's string representation

    :param source: dict Source data
    :return: str Hash (hex digest)
    """
    representation = str(source).encode('utf-8')

    return hashlib.md5(representation).hexdigest()

@ -0,0 +1,185 @@
import os
import json
import logging
from asyncio.locks import Semaphore
from hashlib import md5
from logging.handlers import RotatingFileHandler
from typing import Any, List, Union
from aiofiles import open as async_open
def create_path(path: str) -> None:
    """
    Creates path recursively

    :param path: str Path to be created
    :return: None
    """
    # When pointing at an HTML / JSON file, target its parent directory
    extension = os.path.splitext(path)[1].lower()
    if extension in ('.html', '.json'):
        path = os.path.dirname(path)

    # Skip paths that already exist
    if os.path.exists(path):
        return

    try:
        os.makedirs(path)

    # Guard against race condition
    except OSError:
        pass
def flatten(data: list[list]) -> List[Any]:
    """
    Flattens list of lists into a single list

    :param data: list List of lists
    :return: list Flattened list
    """
    flattened: List[Any] = []

    for sublist in data:
        flattened.extend(sublist)

    return flattened
def get_logger(log_dir: str, log_name: str) -> logging.Logger:
    """
    Initializes logger & implements logfile rotation

    Loggers are keyed by their logfile path, so repeated calls return
    the same logger instead of stacking duplicate handlers (which
    would emit every record multiple times). Previously the shared
    `__name__` logger was used, so each call attached another handler.

    :param log_dir: str Logs directory
    :param log_name: str Log filename
    :return: logging.Logger
    """
    # Key logger by logfile, so each logfile gets exactly one logger
    log_file = os.path.join(log_dir, log_name)
    logger = logging.getLogger(log_file)

    # Set loglevel
    logger.setLevel(logging.INFO)

    # Attach handler only once per logger
    if not logger.handlers:
        # Configure log handler, rotating at 1 MB & keeping five backups
        log_handler = RotatingFileHandler(
            filename = log_file,
            maxBytes = 1024 * 1024,
            backupCount = 5
        )

        # Implement formatting
        log_handler.setFormatter(logging.Formatter('%(asctime)s %(name)s: %(levelname)-8s %(message)s'))

        # Add log handler to logger
        logger.addHandler(log_handler)

    return logger
def data2hash(data: Any) -> str:
    """
    Builds hash over given data

    :param data: typing.Any Data
    :return: str Hash (hex digest)
    """
    # Hash the string representation, so any data type works
    string_representation = str(data)

    return md5(string_representation.encode('utf-8')).hexdigest()
async def load_html(html_file: str, max_files: Semaphore) -> str:
    """
    Loads data from HTML file (async version)

    :param html_file: str Path to HTML file
    :param max_files: asyncio.locks.Semaphore File opening limit
    :return: str HTML data
    """
    # Limit concurrently opened files via semaphore, then read everything
    async with max_files:
        async with async_open(html_file, 'r') as handle:
            contents = await handle.read()

    return contents
async def dump_html(data: str, html_file: str, max_files: Semaphore) -> None:
    """
    Dumps HTML data to file

    :param data: str Data
    :param html_file: str Path to HTML file
    :param max_files: asyncio.locks.Semaphore File opening limit
    :return: None
    """
    # Ensure the target directory exists
    create_path(html_file)

    # Limit concurrently opened files via semaphore, then write everything
    async with max_files:
        async with async_open(html_file, 'w') as handle:
            await handle.write(data)
async def load_json(json_file: str, max_files: Semaphore) -> Union[dict, list]:
    """
    Loads data from JSON file (async version)

    :param json_file: str Path to JSON file
    :param max_files: asyncio.locks.Semaphore File opening limit
    :return: dict|list Data
    :raises: Exception Decoding error
    """
    try:
        # Respect file opening limit when ..
        async with max_files:
            # .. loading data
            async with async_open(json_file, 'r') as file:
                return json.loads(await file.read())

    except json.decoder.JSONDecodeError as error:
        # Name the offending file & keep the decoding error as cause
        # (was a bare `raise Exception`, discarding all context)
        raise Exception('Unable to decode JSON file: "{}"'.format(json_file)) from error
async def dump_json(data: Union[dict, list], json_file: str, max_files: Semaphore) -> None:
    """
    Dumps JSON data to file

    :param data: dict|list Data
    :param json_file: str Path to JSON file
    :param max_files: asyncio.locks.Semaphore File opening limit
    :return: None
    """
    # Ensure the target directory exists
    create_path(json_file)

    # Serialize first, then write in one go
    serialized = json.dumps(data, ensure_ascii=False, indent=4)

    # Limit concurrently opened files via semaphore
    async with max_files:
        async with async_open(json_file, 'w') as handle:
            await handle.write(serialized)
def list2chunks(data: list, size: int) -> List[list]:
    """
    Splits list into chunks of (at most) given size

    :param data: list Data to be split
    :param size: int Chunk size
    :return: list<list> Chunks
    """
    chunks: List[list] = []

    # Step through the data in `size`-sized strides
    for start in range(0, len(data), size):
        chunks.append(data[start:start + size])

    return chunks

@ -0,0 +1,9 @@
{
"headers": {
"From": "your@email.com",
"Referer": "twobrain.io",
"User-Agent": "your-UA-string"
},
"timeout": 10,
"wait": 5
}

180347
index.json

File diff suppressed because it is too large Load Diff
Loading…
Cancel
Save