Added Export to Excel, uploadable to Outlands cloud with share link

This commit is contained in:
Helge-Mikael Nordgård 2025-02-08 13:35:59 +01:00
parent c94fa528fe
commit f5c34d9b0f


@ -1,12 +1,18 @@
from typing import List, Union, Generator, Iterator, Dict
from pydantic import BaseModel
from typing import List, Union, Generator, Iterator
from pydantic import BaseModel, Field
from sqlalchemy import create_engine
from sqlalchemy import text
import pylightxl as xl
from urllib.parse import urlencode
from openpyxl import load_workbook
from openpyxl.styles import Alignment, Font
from openpyxl.drawing.image import Image
from copy import copy
import logging
import os
import requests
import json
import time
import ast
logging.basicConfig(level=logging.DEBUG)
@ -23,12 +29,23 @@ class Pipeline:
DB_DATABASE: str
DB_TABLES: List[str]
XL_TEMPLATE_PATH: str
KEYSTORE_URL: str
KEYSTORE_API: str
CLOUD_HOST: str
CLOUD_API_VERSION: str
CLOUD_EXCEL_LIBRARY: str
CLOUD_USER: str
CLOUD_PASS: str
CLOUD_EXCEL_TEMPLATE_FILENAME: str
TMP_STORAGE_DIR: str
def __init__(self):
self.name = "ØMS Membership Database"
self.name = "01 PolarPress Database RAG"
self.engine = None
self.nlsql_response = ""
self.last_emit_time = 0
self.valves = self.Valves(
**{
"pipelines": ["*"],
@ -41,7 +58,15 @@ class Pipeline:
"DB_PASSWORD": os.getenv("DB_PASSWORD", "YOUR_PASSWORD"),
"DB_DATABASE": os.getenv("DB_DATABASE", "pp_polarpress_demo_prod"),
"DB_TABLES": ["users", "club_memberships", "stripe_transactions", "vipps_transactions"],
"XL_TEMPLATE_PATH": os.getenv("XL_TEMPLATE_PATH", "/var/support/openwebui/xl_templates/oms_dataexport.xlsx")
"KEYSTORE_URL": os.getenv("KEYSTORE_URL", "https://keystore.outlands.no"),
"KEYSTORE_API": os.getenv("KEYSTORE_API", "123"),
"CLOUD_HOST": os.getenv("CLOUD_HOST", "outlands.no"),
"CLOUD_API_VERSION": os.getenv("CLOUD_API_VERSION", "api2"),
"CLOUD_EXCEL_LIBRARY": os.getenv("CLOUD_EXCEL_LIBRARY", "exports"),
"CLOUD_USER": os.getenv("CLOUD_USER", "username"),
"CLOUD_PASS": os.getenv("CLOUD_PASS", "password"),
"CLOUD_EXCEL_TEMPLATE_FILENAME": os.getenv("CLOUD_EXCEL_TEMPLATE_FILENAME", "oms_eksportmal.xls"),
"TMP_STORAGE_DIR": os.getenv("TMP_STORAGE_DIR", "/home/heno/tmp"),
}
)
@ -56,10 +81,181 @@ class Pipeline:
async def on_startup(self):
self.init_db()
await self.sea_login()
await self.sea_check_library()
await self.sea_link()
async def on_shutdown(self):
pass
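# Authenticate against the Seafile cloud host: POST the configured credentials to the
# auth-token endpoint and keep the returned token in self.token for later API calls.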
async def sea_login(self):
try:
headers= {
'Content-Type': 'application/x-www-form-urlencoded'
}
data = urlencode({'username': self.valves.CLOUD_USER, 'password': self.valves.CLOUD_PASS})
response = requests.post(
f"https://{self.valves.CLOUD_HOST}/{self.valves.CLOUD_API_VERSION}/auth-token/",
data=data,
headers=headers
)
try:
data = response.json()
except json.JSONDecodeError:
logging.error("Failed to decode json response connecting to the cloud")
return False
if 'non_field_errors' in data:
if data['non_field_errors'][0] == 'Unable to login with provided credentials.':
logging.error(f"Invalid username or password for user {self.valves.CLOUD_USER} on seafile host {self.valves.CLOUD_HOST} provided. Could not get auth token")
return False
elif 'token' in data:
self.token = data['token']
print(f"Login to seafile cloud host {self.valves.CLOUD_HOST} successful")
return True
else:
logging.error("Unexpected response from seafile server")
return False
except requests.RequestException as e:
logging.error(f"Unable to connect to seafile cloud {self.valves.CLOUD_HOST}. Error: {e}")
return False
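# Check that the configured export library exists on the cloud host and is writable;
# create it via the repos endpoint if it is missing, and remember its id in self.library_id.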
async def sea_check_library(self) -> bool:
try:
headers = {
'Authorization': f"Token {self.token}",
'Accept': 'application/json; indent=4'
}
params = {
'type': 'mine'
}
response = requests.get(
f"https://{self.valves.CLOUD_HOST}/{self.valves.CLOUD_API_VERSION}/repos/",
headers=headers,
params=params
)
res = response.json()
library_exists = False
library_id = None
writable = True
for library in res:
if library['name'] == self.valves.CLOUD_EXCEL_LIBRARY:
library_exists = True
library_id = library['id']
if library.get('encrypted') or library.get('permission') == 'r':
writable = False
if not writable:
logging.error(f"The library {self.valves.CLOUD_EXCEL_LIBRARY} exists but is is not writable. Log in to {self.valves.CLOUD_HOST} from your browser and check it's permission settings")
return False
if not library_exists:
make_library_response = requests.post(
f"https://{self.valves.CLOUD_HOST}/{self.valves.CLOUD_API_VERSION}/repos/",
headers=headers,
json={
'name': self.valves.CLOUD_EXCEL_LIBRARY
}
)
if make_library_response.status_code == 400:
logging.error(f"Could not create neccessary library {self.valves.CLOUD_EXCEL_LIBRARY} on your {self.valves.CLOUD_HOST} account. Contact a system administrator")
return False
elif make_library_response.status_code == 520:
logging.error(f"Could not create neccessary library {self.valves.CLOUD_EXCEL_LIBRARY} on your {self.valves.CLOUD_HOST} account. Contact a system administrator")
return False
self.library_id = make_library_response.json()['repo_id']
print(f"The library {self.valves.CLOUD_EXCEL_LIBRARY} was created", fg='yellow')
return True
self.library_id = library_id
print(f"The library {self.valves.CLOUD_EXCEL_LIBRARY} exists, no need to create")
return True
except requests.RequestException as e:
logging.error(f"Unable to connect to cloud {self.valves.CLOUD_HOST}. Error: {e}")
return False
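# Request an upload link for the root of the export library and store it in self.upload_link.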
async def sea_link(self) -> bool:
try:
response = requests.get(
f"https://{self.valves.CLOUD_HOST}/{self.valves.CLOUD_API_VERSION}/repos/{self.library_id}/upload-link/?p=/",
headers={'Authorization': 'Token {token}'.format(token=self.token)}
)
if response.status_code == 403:
logging.error(f"Lacking permissions to upload to library '{self.library_id}'. Doublecheck permissions by logging into https://{self.valves.CLOUD_HOST} and go to the library to check it's settings")
return False
elif response.status_code == 500:
logging.error(f"Could not get permission to upload. Check that you have enough storage permissions left on your {self.valves.CLOUD_HOST} account, and if not try to delete some old files first.")
return False
self.upload_link = response.json()
print(f"Recieved go ahead for upload on {self.valves.CLOUD_HOST}")
return True
except requests.RequestException as e:
logging.error(f"Unable to connect to cloud {self.valves.CLOUD_HOST}. Error: {e}")
return False
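# Upload the generated Excel file to the library root using the previously fetched upload link.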
def sea_upload(self, path: str, file_name: str) -> bool:
if not os.path.isfile(path):
print(f"Cannot read and upload '{path}'")
return False
try:
requests.post(
self.upload_link, data={'filename': file_name, 'parent_dir': '/'},
files={'file': open(path, 'rb')},
headers={'Authorization': 'Token {token}'.format(token=self.token)}
)
print(f"Excel file '{file_name}' successfully uploaded and stored to your {self.valves.CLOUD_HOST}")
return True
except requests.RequestException as e:
logging.error(f"Unable to upload file to {self.valves.CLOUD_HOST}. Error: {e}")
return False
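# Create a download-only share link (1 day expiry) for the uploaded file and return the
# link URL, or False on failure.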
def sea_share(self, file: str):
try:
headers = {
'Authorization': f'Token {self.token}',
'Content-Type': 'application/json',
'Accept': 'application/json; indent=4'
}
json_data = {
'repo_id': self.library_id,
'path': f'/{file}',
'permissions': {
'can_edit': False,
'can_download': True
},
'expire_days': 1
}
response = requests.post(
f"https://{self.valves.CLOUD_HOST}/api/v2.1/share-links/",
json=json_data,
headers=headers
)
res = response.json()
if response.status_code < 300:
print(f"Excel file '{file}' successfully shared: {res['link']}")
return res["link"]
else:
return False
except requests.RequestException as e:
logging.error(f"Unable to share {file}. Error: {e}")
return False
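# run_llm_query (body truncated in this diff) sends an instruction string to the LLM and,
# judging by its use in pipe(), returns a dict with 'success' and 'data' keys.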
def run_llm_query(self, message: str):
try:
response = requests.post(
@ -103,10 +299,40 @@ class Pipeline:
with self.engine.connect() as connection:
result = connection.execute(text(query))
rows = result.fetchall()
return str(rows)
return rows
except Exception as e:
return {"error": str(e)}
def give_excel_book_description(self, message: str, query: str, result: str):
llm_instructions = f"""
Based on the input question, MySQL query and query result, give a nice text summary in NORWEGIAN
of the data you have extracted from the database. The text will be used in an Excel spreadsheet export.
Input question: {message}
MySQLQuery: {query}
Query result: {result}
"""
return llm_instructions
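# Build the prompt that asks the LLM for Norwegian column headers, returned as a Python
# list literal in string form.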
def generate_table_headers(self, message: str, query: str, result: str):
llm_instructions = f"""
Based on the input question, MySQL query and query result, give the columns in Query result the appropriate
header names translated to NORWEGIAN.
IMPORTANT: Return a python list formatted as a string with ONLY the column header names, not the result of the query.
DO NOT encapsulate your reply in markdown or any other formatting besides the python list formatted as a string.
Example of valid string to generate: ["ID", "Navn", "Epostadresse", "Dato opprettet"]
Input question: {message}
MySQLQuery: {query}
Query result: {result}
Python list formatted string of table header names for columns as they appear in "Query result":
"""
return llm_instructions
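# Build the second-stage instructions for the LLM from the question, the generated SQL
# query and its result (the prompt text below is truncated by the diff).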
def reformat_data(self, message: str, query: str, result: str):
llm_reformat_instructions = f"""
Given an input question, create a syntactically correct mysql query to run. You have 4 tables to work with:
@ -178,22 +404,118 @@ class Pipeline:
MySQLQuery:
"""
initial = self.run_llm_query(llm_initial_instructions)
if initial["success"]:
query = initial["data"]
query_result = self.run_mysql_query(query)
if isinstance(query_result, dict) and "error" in query_result:
return f"Error occurred: {query_result['error']}. Initial data: {initial['data']}"
formatted = self.reformat_data(user_message, query, query_result)
formatted_result = self.run_llm_query(formatted)
# Check whether body['stream'] is true, which indicates that the pipeline is handling the user's actual request
if body['stream']:
data = formatted_result["data"]
if formatted_result["success"]:
return data
# Run the first inference with the instructions in llm_initial_instructions to tell
# the AI model that we want an SQL query as the answer, based on user_message (what
# the user wrote/asked in the chat)
initial = self.run_llm_query(llm_initial_instructions)
# If the inference succeeded, continue to the next step
if initial["success"]:
# query holds the SQL query generated by the AI model
query = initial["data"]
# Run the SQL query and store the result (the raw rows as a list of tuples) in query_result
query_result = self.run_mysql_query(query)
# Instruct the AI to generate a description for the spreadsheet, as well as column names
instruction_result = self.run_llm_query(
self.give_excel_book_description(user_message, query, str(query_result))
)
reply_description = instruction_result["data"]
instruction_result = self.run_llm_query(
self.generate_table_headers(user_message, query, str(query_result))
)
header_names = instruction_result["data"]
# Add the data to the Excel sheet
original_file_path = f"{self.valves.TMP_STORAGE_DIR}/{self.valves.CLOUD_EXCEL_TEMPLATE_FILENAME}"
excel_start_row = 4
wb = load_workbook(original_file_path)
ws = wb.active
# Copy the ØMS logo into the new spreadsheet
img = Image(f"{self.valves.TMP_STORAGE_DIR}/oms-glow.png")
img.height = 145
img.width = 139
img.anchor = 'A1'
ws.add_image(img)
# Set the headline
headline = "Østfold Milsim KI dataeksport"
ws.merge_cells('B1:E1')
cell = ws.cell(row=1, column=2, value=headline)
cell.alignment = Alignment(horizontal='left', vertical='top', wrap_text=True)
cell.font = Font(size=22, bold=True)
# Since reply_description can be a fairly large amount of text, adjust the cell's style and properties
ws.merge_cells('A2:E2')
cell = ws.cell(row=2,column=1, value=reply_description)
cell.alignment = Alignment(horizontal='left', vertical='top', wrap_text=True)
cell.font = Font(size=10)
ws.row_dimensions[2].height = 300
ws.row_dimensions[1].height = 145
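# The header reply is expected to be a Python list literal (e.g. ["ID", "Navn", "Epostadresse", "Dato opprettet"]);
# parse it with ast.literal_eval before writing the headers to row 3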
header_list = ast.literal_eval(header_names)
for i, row_data in enumerate(header_list, start=1):
cell = ws.cell(row=3, column=i, value=row_data)
cell.font = Font(bold=True)
for i, row_data in enumerate(query_result, start=excel_start_row):
for j, value in enumerate(row_data, start=1): # Column A corresponds to index 1
ws.cell(row=i, column=j, value=value)
# Save the Excel workbook and upload it to the cloud
timestamp = int(time.time() * 1000) # UNIX timestamp in milliseconds to give the file a unique name
new_file_name = f"ny_eksport_{timestamp}.xlsx"
new_file_path = f'{self.valves.TMP_STORAGE_DIR}/{new_file_name}'
wb.save(new_file_path)
self.sea_upload(new_file_path, new_file_name)
# Instruct the cloud to create a share link for the file
share_file = self.sea_share(new_file_name)
# If a syntax error or other error occurred in the query, return the error in the chat
if isinstance(query_result, dict) and "error" in query_result:
return f"Error occurred: {query_result['error']}. Initial data: {initial['data']}"
# formatted will contain the second-stage instructions for the AI model, including:
# 1. user_message - the user's initial request to the AI model
# 2. query - the SQL query the AI model generated in the first inference step
# 3. query_result - the raw data from the database
formatted = self.reformat_data(user_message, query, str(query_result))
# Run the second inference with the second-stage instructions for the AI model
formatted_result = self.run_llm_query(formatted)
# data will contain the AI model's answer from the second inference step
data = formatted_result["data"]
# Attach the link to the Excel sheet in the chat
if share_file:
data += f"\n\n## Eksport:\nI tilfelle resultatet fra spørringen overgår begrensningene i antall tokens i svaret, har jeg eksportert dataene du spurte om i et eget excel ark hvor du kan se hele datasettet. Du kan laste ned Excel arket her:\n1. [Regneark delelink på outlands.no]({share_file})"
# If the result is successful, return the answer from the second inference step in the chat
if formatted_result["success"]:
return data
return f"Error occured: {data}"
return f"Error occured: {data}"
# If the first inference step was not successful, return an error message in the chat
else:
data = initial["data"]
return f"Error occured: {data}"
# If body['stream'] is not true, it indicates that Open WebUI is running inference to
# generate a title for the chat, or autocomplete. In that case we only run inference on
# the user's message and not the database request itself
else:
data = initial["data"]
return f"Error occured: {data}"
response = self.run_llm_query(user_message)
return response["data"]