Transkribus

List the documents in a collection, together with each document's nrOfNew count (pages not transcribed yet)

import requests,json,os,sys

# Transkribus identifiers this script works with.
collection_id = 297657  # collection "Burgerlijke Stand"
htr_model_id = 58997  # HTR model "Dutch Demeter 1"

# Root of the Transkribus REST API; endpoint paths are appended to it.
base_url = 'https://transkribus.eu/TrpServer/rest'

# Login credentials are read from the environment; each is None when unset.
username, password = (
    os.environ.get(name) for name in ("TRANSKRIBUS_USER", "TRANSKRIBUS_PASS")
)

# Fail early with a clear message instead of posting empty credentials.
if not username or not password:
    raise SystemExit(
        "Set TRANSKRIBUS_USER and TRANSKRIBUS_PASS before running this script."
    )

# Log in with form-encoded credentials and ask for a JSON reply.
auth_response = requests.post(
    f"{base_url}/auth/login",
    data={'user': username, 'pw': password},
    headers={'Accept': 'application/json'},
    timeout=30,  # requests waits indefinitely unless a timeout is given
)
# Surface a rejected login as an HTTP error rather than a KeyError below.
auth_response.raise_for_status()
auth_token = auth_response.json()['sessionId']

# Headers reused by the later API calls; the session id travels as a
# JSESSIONID cookie.
# NOTE(review): a 'Authorization: Bearer <sessionId>' header was tried here
# earlier and left disabled because it seemed not to work - confirm against
# the API documentation before re-enabling.
headers = {
    'Content-Type': 'application/json',
    'Accept': 'application/json',
    'Cookie': f'JSESSIONID={auth_token}',
}

# Fetch the list of documents in the collection.
documents_response = requests.get(
    f"{base_url}/collections/{collection_id}/list",
    headers=headers,
    timeout=30,  # requests waits indefinitely unless a timeout is given
)
# Stop on an HTTP error instead of trying to parse an error page as JSON.
documents_response.raise_for_status()
documents_info = documents_response.json()

# Print each document's title next to its "nrOfNew" value.
for doc in documents_info:
    doc_id = doc["docId"]
    # The per-document reply is stored under the same name as the list reply.
    documents_response = requests.get(
        f"{base_url}/collections/{collection_id}/{doc_id}/fulldoc",
        headers=headers,
        timeout=30,
    )
    documents_response.raise_for_status()
    # Title and page counters are read from the "md" part of the reply.
    doc_info = documents_response.json()["md"]
    title = doc_info["title"]
    print(title, doc_info["nrOfNew"])