# Created April 18, 2023 17:38
# Save kalloc/e13226001c58e5fe43f371d0c98f4d4b to your computer and use it in GitHub Desktop.
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
import os | |
import sys | |
import pymysql | |
import pymysql.cursors | |
import requests | |
from lxml import html | |
# Database settings are taken from the environment so that credentials
# never have to be written into the script itself.
DB_PASS = os.environ.get('DB_PASS')
DB_NAME = os.environ.get('DB_NAME')
DB_HOST = os.environ.get('DB_HOST')
DB_USER = os.environ.get('DB_USER')

# Refuse to start when any setting is missing or empty.
if not DB_PASS or not DB_NAME or not DB_HOST or not DB_USER:
    print('Please set DB_PASS, DB_NAME, DB_HOST, DB_USER')
    # sys.exit() instead of exit(): the latter is a helper installed by the
    # `site` module for interactive sessions and is not meant for programs.
    sys.exit(1)

# Connect to the database. DictCursor is requested so that fetched rows can
# be addressed by column name (e.g. result['pdfLink']).
connection = pymysql.connect(host=DB_HOST,
                             user=DB_USER,
                             password=DB_PASS,
                             database=DB_NAME,
                             cursorclass=pymysql.cursors.DictCursor)
def getActualPdfLink(id, timeout=30):
    """Fetch the PDF link currently published for a record on delta.rsl.ru.

    The record page is downloaded and parsed as HTML. The function collects
    the text nodes of every ``span.data`` that follows a ``span.fieldName``
    whose text is ``856`` and returns the second of those text nodes.

    Args:
        id: Record identifier; interpolated into the page URL as-is.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled server cannot hang the whole run.

    Returns:
        The second matching text node from the page.

    Raises:
        requests.RequestException: On connection problems, timeout, or an
            HTTP error status.
        IndexError: If the page contains fewer than two matching text nodes.
    """
    url = f'http://delta.rsl.ru/info/show/mrc/rsl01/{id}'
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()

    tree = html.fromstring(response.text)
    pdf_link = tree.xpath("//span[@class='fieldName'][text()='856']/following-sibling::span[@class='data']/text()")

    # The link is expected at index 1; report clearly when the page does not
    # have it rather than failing with a bare "list index out of range".
    if len(pdf_link) < 2:
        raise IndexError(f'No PDF link (field 856) found at {url}')
    return pdf_link[1]
def do_update_link(connection, result, actualPdfLink):
    """Store a new PDF link for a record and flag the record for re-indexing.

    Runs two statements on ``connection``: an UPDATE of
    ``tbl_common_biblio_card.pdfLink`` for the row with ``result['id']``, and
    an upsert into ``tbl_indexed`` that sets ``is_indexed`` to false for the
    record's ``FullSymbolicId`` / ``ALIS`` pair. Nothing is committed here;
    committing is left to the caller.

    Args:
        connection: Open database connection; a cursor is taken from it for
            the duration of the call.
        result: Row mapping with the keys ``id``, ``FullSymbolicId`` and
            ``ALIS``.
        actualPdfLink: The link value to write into ``pdfLink``.
    """
    # NOTE: idFromALIS is not a parameter. It is read from module scope,
    # where the main loop of the script assigns it; it is used for logging only.
    print('Updating pdfLink', idFromALIS)

    update_sql = "UPDATE `tbl_common_biblio_card` SET `pdfLink`=%s WHERE `id`=%s"
    reindex_sql = (
        "INSERT INTO `tbl_indexed` (`FullSymbolicId`, `ALIS`, `is_indexed`) VALUES (%s, %s, false)"
        " ON DUPLICATE KEY UPDATE `is_indexed`=false"
    )

    # Use a cursor obtained from the connection that was passed in instead of
    # relying on a module-level `cursor` variable existing at call time.
    with connection.cursor() as cur:
        cur.execute(update_sql, (actualPdfLink, result['id']))
        cur.execute(reindex_sql, (result['FullSymbolicId'], result['ALIS']))
def do_force_reindex(connection, result):
    """Flag a record for re-indexing without touching its PDF link.

    Upserts a row into ``tbl_indexed`` for the record's ``FullSymbolicId`` /
    ``ALIS`` pair with ``is_indexed`` set to false. Nothing is committed
    here; committing is left to the caller.

    Args:
        connection: Open database connection; a cursor is taken from it for
            the duration of the call.
        result: Row mapping with the keys ``FullSymbolicId`` and ``ALIS``.
    """
    # NOTE: idFromALIS is not a parameter. It is read from module scope,
    # where the main loop of the script assigns it; it is used for logging only.
    print('pdfLink is actual, request reindex', idFromALIS)

    reindex_sql = (
        "INSERT INTO `tbl_indexed` (`FullSymbolicId`, `ALIS`, `is_indexed`) VALUES (%s, %s, false)"
        " ON DUPLICATE KEY UPDATE `is_indexed`=false"
    )

    # Use a cursor obtained from the connection that was passed in instead of
    # relying on a module-level `cursor` variable existing at call time.
    with connection.cursor() as cur:
        cur.execute(reindex_sql, (result['FullSymbolicId'], result['ALIS']))
def get_doc_info(connection, idFromALIS):
    """Look up one record of ``tbl_common_biblio_card`` by its ``idFromALIS``.

    Args:
        connection: Open database connection; a cursor is taken from it for
            the duration of the call.
        idFromALIS: Value matched against the ``idFromALIS`` column.

    Returns:
        The first matching row as returned by ``fetchone()`` with the columns
        ``id``, ``FullSymbolicId``, ``ALIS`` and ``pdfLink``, or ``None`` when
        no row matches.
    """
    sql = "SELECT `id`, `FullSymbolicId`, `ALIS`, `pdfLink` FROM `tbl_common_biblio_card` WHERE `idFromALIS`=%s"

    # Use a cursor obtained from the connection that was passed in instead of
    # relying on a module-level `cursor` variable existing at call time.
    with connection.cursor() as cur:
        cur.execute(sql, (idFromALIS,))
        return cur.fetchone()
# Main script: for every id listed in the input file, compare the PDF link
# stored in the database with the one currently published, update the stored
# link when it differs, and in either case flag the record for re-indexing.
with connection:
    if len(sys.argv) < 2:
        print('Usage: python3 index.py to_be_indexed.txt')
        sys.exit(1)
    filename = sys.argv[1]

    # Read the ids from the file named on the command line; previously the
    # argument was required but a hard-coded file name was opened instead.
    with open(filename, 'r') as file:
        for idFromALIS in file:
            idFromALIS = idFromALIS.strip()
            if not idFromALIS:
                # Blank lines (e.g. a trailing newline) carry no id.
                continue

            # `cursor` and `idFromALIS` are intentionally kept as module-level
            # names: helper functions in this file may read them as globals.
            with connection.cursor() as cursor:
                result = get_doc_info(connection, idFromALIS)
                if not result:
                    print('No such idFromALIS', idFromALIS)
                    continue

                actualPdfLink = getActualPdfLink(idFromALIS)
                if result['pdfLink'] != actualPdfLink:
                    do_update_link(connection, result, actualPdfLink)
                else:
                    do_force_reindex(connection, result)

            # Commit once per processed record so that finished work is kept
            # even if a later record fails.
            connection.commit()

    print('Done')
# Sign up for free to join this conversation on GitHub.
# Already have an account? Sign in to comment