# https://rajasgs.gitbook.io/pynotes/gcp#data-modeling
# https://github.com/tactlabs/student-hiring-prediction-mle/tree/main/dataset
import pyutil as pyu
pyu.get_local_pyinfo()
'conda env: ml311; pyv: 3.11.10 (main, Oct 3 2024, 07:29:13) [GCC 11.2.0]'
!pip show duckdb | grep "Version:"
# Connect to DuckDB
con = duckdb.connect()
# Load CSV file
con.execute("CREATE TABLE my_table AS SELECT * FROM '../dataset/student_data.csv'")
<duckdb.duckdb.DuckDBPyConnection at 0x7ff46ea2adf0>
# Perform SQL query
result = con.execute("SELECT * FROM my_table WHERE student_name = 'Patricia Hall'").fetchall()
[(18, 'Patricia Hall', 71, 75, 70, 73, 0)]
[(18, 'Patricia Hall', 71, 75, 70, 73, 0)]
import pandas as pd
from IPython.display import display
def list2table(cdata, col_list = None):
first_row = cdata[0]
cols_count = len(first_row)
if not col_list:
col_list = []
for idx in range(cols_count):
col_list.append(f'col_{idx}')
# Convert the list of tuples to a DataFrame
df1 = pd.DataFrame(cdata, columns = col_list)
# Set display options
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
# Apply custom styling
styled_df = df1.style.set_properties(**{'text-align': 'center'})
# Display the DataFrame
display(styled_df)
def q(query, col_list = None):
result = con.execute(query).fetchall()
# print(result)
return list2table(result, col_list)
q("SELECT * FROM my_table WHERE student_id = 1")
| |
col_0 |
col_1 |
col_2 |
col_3 |
col_4 |
col_5 |
col_6 |
| 0 |
1 |
John Doe |
85 |
90 |
80 |
88 |
1 |
q("""
SELECT
STUDENT_NAME
FROM my_table
limit 2
""", ["student_name"])
| |
student_name |
| 0 |
John Doe |
| 1 |
Jane Smith |
q("""
SELECT
AVG(test_scores)
FROM my_table
limit 2
""",
["one"])
q("""
SELECT
"student_name"
FROM my_table
limit 2
""",
["one"])
| |
one |
| 0 |
John Doe |
| 1 |
Jane Smith |
q("""
SELECT
student_id
FROM my_table
WHERE student_name IN ('John Doe', 'Jane Smith')
""",
["one"])
Score: 15