import gradio as gr
import requests
import re
import os
# Bearer token for the inference endpoint, read once at import time.
# It is None when the API_KEY environment variable is not set.
API_KEY = os.environ.get('API_KEY')
# Example prompts for the UI. Each prompt is wrapped in its own single-item
# list, i.e. one row per example with a value for the first input only.
sample_prompts = [
    [prompt_text]
    for prompt_text in (
        "Ensure the `property_type` column contains only distinct values matching `['Residential', 'Commercial', 'Industrial']`.",
        "For the `billing_amount` column: Ensure the sum is less than or equal to 5000 and the average is between 50 and 200.",
        "For the `policy_number` field: Ensure all entries are unique and formatted as 'INS-XXXXXX'. Validate that this field does not contain any null values.",
        "For field 'metadata': Ensure the field contains valid JSON objects and that parsing does not raise exceptions.",
        "Ensure that the `score` column contains integer values only and that they are greater than or equal to zero.",
        "For the `employee_id` field: Ensure that all values are unique and fall within the specified set of valid employee IDs.",
        "Check that the total count of emails in the `emails` table equals the count of unique addresses in the `recipients` column.",
        "For the `sender_id` and `receiver_id` columns: Ensure that each ID in `sender_id` exists in the `users` table and is unique across records.",
        "For fields `likes` and `shares`: Ensure the sum of `likes` and `shares` does not exceed `engagement_score`. Ensure all fields are of type 'integer' and not null.",
        "Ensure the `employee_id` column has distinct values and all fall within the range of 1 to 5000.",
        "Validate that the `metadata` column contains valid JSON objects that adhere to the specified schema.",
        "For the `email` field: Ensure it follows a valid email format and contains unique values. Confirm the field is required.",
    )
]
# Example Great Expectations output, one string per sample prompt (only the
# first five prompts have a counterpart here). Multiple expectations for the
# same prompt are comma-separated inside a single string.
sample_ge = [
    "expect_column_values_to_be_in_set(column='property_type', value_set=['Residential', 'Commercial', 'Industrial'])",
    "expect_column_sum_to_be_between(column='billing_amount', min_value=0, max_value=5000),expect_column_mean_to_be_between(column='billing_amount', min_value=50, max_value=200)",
    # The backslash is escaped so the literal no longer contains the invalid
    # escape sequence "\d"; the quantifier is {6} to match the six
    # placeholders of 'INS-XXXXXX' in the corresponding sample prompt.
    "expect_column_values_to_be_unique(column='policy_number'),expect_column_values_to_match_regex(column='policy_number', regex=r'^INS-\\d{6}$'),expect_column_values_to_not_be_null(column='policy_number')",
    "expect_column_values_to_be_json_parseable(column='metadata'),expect_column_values_to_not_be_null(column='metadata') ",
    "expect_column_values_to_be_of_type(column='score', type_='int'),expect_column_values_to_be_between(column='score', min_value=0, max_value=None)",
]
# Markdown shown below the interface (passed as the `article` argument).
# Built from adjacent literals; the leading and trailing newlines are kept.
article_content = (
    "\n"
    "Key Features\n"
    "- Custom Validation Rules: Input any data quality validation rule, and the app will generate corresponding Great Expectations expectations.\n"
    "- Comprehensive Validation Support: From validating column values, sums, and averages, to ensuring unique values and matching regex patterns, this app supports a wide range of data quality checks.\n"
    "- User-Friendly Interface: Simply enter your validation rule prompt, and our app will convert it into an easy-to-implement Great Expectations rule.\n"
    "- Live Conversion: Adjust the parameters like temperature and min_p to fine-tune the conversion process, with real-time results.\n"
)
# Groups the example prompts with the example Great Expectations strings
# under display-style keys.
Examples = {
    "Prompts": sample_prompts,
    "Great Expectations": sample_ge,
}
# (connect, read) timeouts in seconds. The read limit is generous because the
# UI description warns that the serverless endpoint may need time to wake up.
_REQUEST_TIMEOUT = (10, 300)


def _extract_expectations(raw_output):
    """Keep only the lines of *raw_output* that start with ``expect_`` or ``#``."""
    kept_lines = [
        line
        for line in raw_output.strip().splitlines()
        if line.startswith(("expect_", "#"))
    ]
    cleaned = "\n".join(kept_lines).strip()
    # Drop the chat-template banner if it survived the line filter.
    return cleaned.replace("Cutting Knowledge Date: December 2023", "")


def query_vllm(prompt, use_cache=True, temperature=0.1, min_p=0.1):
    """Send a data-quality rule to the inference endpoint and return its answer.

    Args:
        prompt: Natural-language description of the validation rule(s).
        use_cache: Accepted so the "Use Cache" checkbox maps onto the
            signature; it is not used by this function.
        temperature: Sampling temperature forwarded in the request payload.
        min_p: ``min_p`` value forwarded in the request payload.

    Returns:
        ``"Expectation:\\n<lines>"`` on success, where only response lines
        starting with ``expect_`` or ``#`` are kept; otherwise a string
        starting with ``"Error: "``. The function reports failures through
        its return value instead of raising, so the UI always gets text.
    """
    if not API_KEY:
        # Without a key the request would be sent as "Bearer None".
        return "Error: the API_KEY environment variable is not set."

    api_url = "https://api.runpod.ai/v2/3xt35aqk9wnzjz/runsync"
    headers = {
        "Content-Type": "application/json",
        "Accept": "application/json",
        "Authorization": f"Bearer {API_KEY}",
    }
    payload = {
        "input": {
            "prompt": prompt,
            "temperature": temperature,
            "min_p": min_p,
        }
    }

    try:
        # A timeout keeps a stalled endpoint from blocking the UI forever.
        response = requests.post(
            api_url, headers=headers, json=payload, timeout=_REQUEST_TIMEOUT
        )

        try:
            response_data = response.json()
        except ValueError:
            # Gateways and proxies may answer with HTML or an empty body.
            return f"Error: HTTP {response.status_code} returned a non-JSON response"

        output = response_data.get("output") if isinstance(response_data, dict) else None
        if not output:
            # Missing, null or empty 'output': surface the server's message.
            message = "Unknown error"
            if isinstance(response_data, dict):
                message = response_data.get("message", "Unknown error")
            return f"Error: {message}"

        try:
            raw_output = output[0]["choices"][0]["tokens"][0]
        except (KeyError, IndexError, TypeError):
            return "Error: unexpected response format from the API"

        return f"Expectation:\n{_extract_expectations(raw_output)}"
    except Exception as e:
        # UI boundary: network failures, timeouts and anything unforeseen are
        # reported as text rather than propagated to the caller.
        return f"Error: {e}"
# Create a Gradio interface.
# NOTE: the non-ASCII characters in the user-facing strings below were
# mis-encoded in the previous revision. Characters whose bytes survived
# (μ, →, —, ’, ✅, 🧪) are restored exactly; the remaining emoji could not be
# recovered and were replaced with plausible ones - adjust to taste.
interface = gr.Interface(
    fn=query_vllm,
    inputs=[
        gr.Textbox(
            label="Prompt",
            placeholder="Describe your data quality rule(s). Example: 'Column `age` must be non-null and between 0 and 120.'",
            lines=6
        ),
        # Passed to query_vllm as `use_cache`.
        gr.Checkbox(label="Use Cache", value=True),
        gr.Slider(label="Temperature", minimum=0.1, maximum=0.5, step=0.01, value=0.1),
        gr.Slider(label="Min P (μP sampling)", minimum=0.0, maximum=0.4, step=0.1, value=0.1),
    ],
    outputs="text",
    title="📊 Data Quality → Great Expectations Converter",
    # The Markdown below is kept flush-left on purpose: leading indentation
    # inside the string would change how it is rendered.
    description="""
### Welcome! 👋
Quickly turn your **data validation rules** into **Great Expectations** suites—ready to plug into your pipelines.
- ✅ Consistent, reliable checks
- 🔌 Easy pipeline integration
- 🧪 Clear, reproducible validations
> **Heads-up:** first run might take a moment while the server wakes up.
> We’re running in serverless mode to keep things light & cost-friendly. 🚀🚀🚀
""",
    article=article_content,
    live=False,
    # Each example row supplies only the prompt; the other inputs have no
    # example values.
    examples=sample_prompts,
    cache_examples=True,
    examples_per_page=6,
    theme="default",
    api_name="generate",
    analytics_enabled=True,
    css="""
/* Subtle card-like wrapper for the output */
.svelte-1ipelgc, .wrap .output-class {
border-radius: 16px;
box-shadow: 0 6px 24px rgba(0,0,0,0.06);
}
/* Make the title pop a bit */
h1, h5 {
letter-spacing: 0.2px;
}
/* Nicer spacing for the description list */
.prose ul { margin-top: 0.5rem; }
.prose blockquote {
border-left: 4px solid rgba(0,0,0,0.08);
padding-left: 0.8rem;
color: rgba(0,0,0,0.7);
}
"""
)
def main():
    """Launch the Gradio app with a public share link."""
    interface.launch(share=True)


# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    main()