# ssctopper/generators/reasoning_generator.py

#!/usr/bin/env python3
"""
General Intelligence & Reasoning Question Generator for SSC CGL.
Generates ~25,000 template-based reasoning questions.
"""
import itertools
import os
import random
import re
import string
import sys

# Put the parent of this file's directory at the front of sys.path so the
# `generators.base` import below can resolve when this file is run directly.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
from generators.base import make_question, get_qtid, nearby_wrong, get_db, insert_questions_batch

# Subject name handed to every get_qtid() lookup in this module.
SUBJECT = "General Intelligence and Reasoning"


# ============ VERBAL REASONING ============

def gen_number_analogy(conn, count=700):
    """Build "a : f(a) :: b : ?" number-analogy questions.

    Each question picks one rule (square, cube, double, add 5), shows it
    applied to ``a`` and asks for the same rule applied to a different ``b``.

    Args:
        conn: Database handle forwarded to ``get_qtid``.
        count: Number of questions to build.

    Returns:
        A list of ``make_question`` results; empty when ``get_qtid`` returns
        a falsy id for this question type.
    """
    questions = []
    qtid = get_qtid(conn, SUBJECT, "Verbal Reasoning", "Analogy", "Number analogy")
    if not qtid:
        return questions

    # (rule name, transformation) pairs, built once rather than per question.
    rules = [
        ("square", lambda x: x * x),
        ("cube", lambda x: x * x * x),
        ("double", lambda x: x * 2),
        ("add 5", lambda x: x + 5),
    ]

    for _ in range(count):
        rule, func = random.choice(rules)
        # Re-draw the first term while another rule maps it to the same value
        # (2 -> 4 is both "square" and "double"; 5 -> 10 is both "double" and
        # "add 5"): such a premise would not identify the intended rule.
        while True:
            a = random.randint(2, 20)
            result_a = func(a)
            if all(other(a) != result_a for name, other in rules if name != rule):
                break
        b = random.randint(2, 20)
        while b == a:
            b = random.randint(2, 20)
        result_b = func(b)
        questions.append(make_question(qtid,
            f"{a} : {result_a} :: {b} : ?",
            str(result_b), nearby_wrong(result_b),
            f"Rule: {rule}. {a}→{result_a}, {b}→{result_b}", 1))
    return questions


def gen_letter_analogy(conn, count=500):
    """Build letter-pair analogy questions based on a fixed alphabet shift.

    A pair such as "AD" (shift 3) is shown together with the first letter of
    a second pair; the expected answer is the completed second pair.

    Args:
        conn: Database handle forwarded to ``get_qtid``.
        count: Number of questions to build.

    Returns:
        A list of ``make_question`` results; empty when ``get_qtid`` returns
        a falsy id for this question type.
    """
    qtid = get_qtid(conn, SUBJECT, "Verbal Reasoning", "Analogy", "Letter analogy")
    if not qtid:
        return []

    letters = string.ascii_uppercase
    built = []
    for _ in range(count):
        step = random.randint(1, 5)
        first = random.randint(0, 20)
        second = random.randint(0, 20)
        while second == first:
            second = random.randint(0, 20)

        pair1 = letters[first] + letters[first + step]
        lead = letters[second]
        target = second + step  # at most 25, so still inside A-Z
        answer = lead + letters[target]
        # Distractors end one/two letters after or one letter before the
        # correct letter, wrapping around the alphabet.
        distractors = [lead + letters[(target + off) % 26] for off in (1, 2, -1)]

        built.append(make_question(
            qtid,
            f"{pair1} : {lead}?",
            answer,
            distractors,
            f"Shift by {step}: {pair1} → {answer}",
            1,
        ))
    return built


def gen_classification(conn, count=1500):
    """Build "find the odd one out" questions for numbers and for words.

    Half of ``count`` (integer division) goes to number classification and
    half to word classification; either half is skipped when ``get_qtid``
    returns a falsy id for it.

    Args:
        conn: Database handle forwarded to ``get_qtid``.
        count: Combined target for both sub-types.

    Returns:
        A list of ``make_question`` results (possibly empty).
    """
    questions = []

    # Number classification (odd one out)
    qtid = get_qtid(conn, SUBJECT, "Verbal Reasoning", "Classification", "Number classification")
    if qtid:
        primes = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47]
        composites = [4, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22]
        squares = [1, 4, 9, 16, 25, 36, 49, 64, 81, 100]
        non_squares = [2, 3, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15]
        for _ in range(count // 2):
            t = random.choice(["even", "odd", "prime", "square"])
            if t == "even":
                # sample() keeps the three members distinct, so the options
                # never contain the same number twice.
                group = [k * 2 for k in random.sample(range(1, 51), 3)]
                odd_one = random.randint(1, 50) * 2 + 1
            elif t == "odd":
                group = [k * 2 + 1 for k in random.sample(range(0, 50), 3)]
                odd_one = random.randint(1, 50) * 2
            elif t == "prime":
                group = random.sample(primes, 3)
                odd_one = random.choice(composites)
            else:
                group = random.sample(squares, 3)
                odd_one = random.choice(non_squares)
            all_opts = group + [odd_one]
            random.shuffle(all_opts)
            questions.append(make_question(qtid,
                f"Find the odd one out: {', '.join(map(str, all_opts))}",
                str(odd_one), [str(x) for x in group],
                f"{odd_one} is not {t}", 1))

    qtid = get_qtid(conn, SUBJECT, "Verbal Reasoning", "Classification", "Word classification")
    if qtid:
        # (members of the category, outsider, category name)
        word_groups = [
            (["Apple", "Mango", "Banana", "Orange"], "Carrot", "Fruits"),
            (["Dog", "Cat", "Lion", "Tiger"], "Eagle", "Mammals"),
            (["Red", "Blue", "Green", "Yellow"], "Square", "Colors"),
            (["Delhi", "Mumbai", "Chennai", "Kolkata"], "India", "Cities"),
            (["Pen", "Pencil", "Marker", "Crayon"], "Book", "Writing tools"),
            (["Piano", "Guitar", "Violin", "Flute"], "Painting", "Instruments"),
            (["January", "March", "May", "July"], "Monday", "Months"),
            (["Mercury", "Venus", "Mars", "Jupiter"], "Moon", "Planets"),
            (["Nile", "Amazon", "Ganges", "Thames"], "Sahara", "Rivers"),
            (["Football", "Cricket", "Tennis", "Hockey"], "Chess", "Outdoor sports"),
        ]
        for _ in range(count // 2):
            group, odd, reason = random.choice(word_groups)
            # Draw three of the four members so every member can appear.
            members = random.sample(group, 3)
            display = members + [odd]
            random.shuffle(display)
            questions.append(make_question(qtid,
                f"Find the odd one out: {', '.join(display)}",
                odd, [x for x in display if x != odd],
                f"{odd} is not in the category: {reason}", 1))
    return questions


def gen_number_series(conn, count=1500):
    """Build "find the next number" questions from five series patterns.

    Patterns: constant difference ("add"), constant ratio ("multiply"),
    consecutive squares ("square"), two alternating step sizes
    ("alternate") and a difference that grows by one each step ("diff").
    Five terms are shown and the sixth is the answer.

    Args:
        conn: Database handle forwarded to ``get_qtid``.
        count: Number of questions to build.

    Returns:
        A list of ``make_question`` results; empty when ``get_qtid`` returns
        a falsy id for this question type.
    """
    questions = []
    qtid = get_qtid(conn, SUBJECT, "Verbal Reasoning", "Number Series", "Find next number")
    if not qtid:
        return questions
    for _ in range(count):
        series_type = random.choice(["add", "multiply", "square", "alternate", "diff"])
        if series_type == "add":
            start = random.randint(1, 50)
            d = random.randint(2, 15)
            series = [start + i * d for i in range(5)]
            ans = start + 5 * d
        elif series_type == "multiply":
            start = random.randint(1, 5)
            r = random.choice([2, 3])
            series = [start * (r ** i) for i in range(5)]
            ans = start * (r ** 5)
        elif series_type == "square":
            start = random.randint(1, 8)
            series = [(start + i) ** 2 for i in range(5)]
            ans = (start + 5) ** 2
        elif series_type == "alternate":
            a, b = random.randint(1, 10), random.randint(1, 10)
            series = [random.randint(1, 20)]
            # Gaps between consecutive terms go +a, +b, +a, +b ...
            for i in range(4):
                series.append(series[-1] + (a if i % 2 == 0 else b))
            # ... so the fifth gap, leading to the answer, is +a again.
            ans = series[-1] + a
        else:  # increasing difference: gaps are d, d+1, d+2, d+3, then d+4
            start = random.randint(1, 10)
            series = [start]
            d = random.randint(1, 5)
            for i in range(4):
                series.append(series[-1] + d + i)
            ans = series[-1] + d + 4

        series_str = ", ".join(map(str, series))
        questions.append(make_question(qtid,
            f"Find the next number in the series: {series_str}, ?",
            str(ans), nearby_wrong(ans),
            f"Pattern: {series_type}. Next = {ans}", random.choice([1, 2])))
    return questions


def _caesar_shift(word, shift):
    """Return uppercase A-Z *word* with every letter moved *shift* places, wrapping past Z."""
    return "".join(chr((ord(c) - 65 + shift) % 26 + 65) for c in word)


def _encode_letters(text, mapping):
    """Return the digit string obtained by replacing each letter of *text* via *mapping*."""
    return "".join(mapping[c] for c in text)


def gen_coding_decoding(conn, count=2000):
    """Build letter-shift and letter-to-digit coding questions.

    Half of ``count`` (integer division) goes to each sub-type; either half
    is skipped when ``get_qtid`` returns a falsy id for it.

    * Letter coding: a sample word and its shifted form are shown, and the
      same shift must be applied to a second word.
    * Number coding: a word is shown with one distinct digit per letter and
      a rearrangement of the same letters must be encoded. Every letter of
      the asked word therefore appears in the premise, so the answer can be
      derived from the question.

    Args:
        conn: Database handle forwarded to ``get_qtid``.
        count: Combined target for both sub-types.

    Returns:
        A list of ``make_question`` results (possibly empty).
    """
    questions = []
    qtid = get_qtid(conn, SUBJECT, "Verbal Reasoning", "Coding-Decoding", "Letter coding")
    if qtid:
        sample_words = ["COME", "GONE", "HELP", "LOVE", "MIND", "PLAY", "ROSE", "SING", "TALK", "WIND",
                        "BACK", "DEEP", "FAST", "GIRL", "HOME", "JUST", "KING", "LAMP", "NAME", "OPEN"]
        target_words = ["BALL", "CAKE", "DARK", "EASY", "FISH", "GOOD", "HAND", "IDOL", "JOKE", "KEEP"]
        for _ in range(count // 2):
            shift = random.randint(1, 5)
            word = random.choice(sample_words)
            coded = _caesar_shift(word, shift)
            word2 = random.choice(target_words)
            coded2 = _caesar_shift(word2, shift)
            # Distractors apply neighbouring shift amounts to the same word.
            wrongs = [_caesar_shift(word2, s) for s in (shift + 1, shift - 1, shift + 2)]
            questions.append(make_question(qtid,
                f"If {word} is coded as {coded}, then {word2} is coded as?",
                coded2, wrongs,
                f"Each letter shifted by +{shift}", 2))

    qtid = get_qtid(conn, SUBJECT, "Verbal Reasoning", "Coding-Decoding", "Number coding")
    if qtid:
        # Every word here has three distinct letters, which gives five other
        # arrangements: one to ask about and enough left for distractors.
        words = ["CAT", "DOG", "SUN", "PEN", "CUP", "BOX", "HAT", "MAP", "JAR", "FAN"]
        for _ in range(count // 2):
            word = random.choice(words)
            # Distinct digits so each letter has exactly one code and no two
            # arrangements encode to the same string.
            digits = random.sample(range(1, 10), len(word))
            mapping = {c: str(d) for c, d in zip(word, digits)}
            code_str = _encode_letters(word, mapping)
            # Sorted so the choice below does not depend on set ordering.
            others = sorted({"".join(p) for p in itertools.permutations(word)} - {word})
            word2 = random.choice(others)
            code2 = _encode_letters(word2, mapping)
            # Distractors are the same digits in a different order (including
            # the premise's own code), so they look plausible but are wrong.
            pool = [code_str] + [_encode_letters(w, mapping) for w in others if w != word2]
            wrongs = random.sample(pool, 3)
            questions.append(make_question(qtid,
                f"If {word} = {code_str}, then {word2} = ?",
                code2, wrongs,
                f"Letter-to-number mapping from {word}={code_str}", 2))
    return questions


# ============ LOGICAL REASONING ============

def gen_blood_relations(conn, count=1500):
    """Build blood-relation questions from fixed A/B/C sentence templates.

    A template is chosen at random and its placeholders A, B and C are
    replaced, in that order, with three distinct single-letter names.

    Args:
        conn: Database handle forwarded to ``get_qtid``.
        count: Number of questions to build.

    Returns:
        A list of ``make_question`` results; empty when ``get_qtid`` returns
        a falsy id for this question type.
    """
    qtid = get_qtid(conn, SUBJECT, "Logical Reasoning", "Blood Relations", "Direct relation")
    if not qtid:
        return []

    # (question template, correct relation, distractor relations)
    relation_templates = [
        ("A is the father of B. B is the sister of C. What is A to C?", "Father", ["Uncle", "Brother", "Grandfather"]),
        ("A is the mother of B. B is the brother of C. What is A to C?", "Mother", ["Aunt", "Sister", "Grandmother"]),
        ("A is the brother of B. B is the son of C. What is A to C?", "Son", ["Nephew", "Brother", "Father"]),
        ("A is the sister of B. B is the daughter of C. What is A to C?", "Daughter", ["Niece", "Sister", "Mother"]),
        ("A is the husband of B. B is the mother of C. What is A to C?", "Father", ["Uncle", "Brother", "Grandfather"]),
        ("A is the wife of B. B is the father of C. What is A to C?", "Mother", ["Aunt", "Sister", "Grandmother"]),
        ("A's father is B's son. What is B to A?", "Grandfather", ["Father", "Uncle", "Brother"]),
        ("A's mother is B's daughter. What is B to A?", "Grandmother", ["Mother", "Aunt", "Sister"]),
        ("A is B's brother's wife. What is A to B?", "Sister-in-law", ["Sister", "Cousin", "Aunt"]),
        ("A is B's father's brother. What is A to B?", "Uncle", ["Father", "Cousin", "Grandfather"]),
    ]
    name_pool = ["P", "Q", "R", "S", "T", "M", "N", "X", "Y", "Z"]

    results = []
    for _ in range(count):
        text, relation, distractors = random.choice(relation_templates)
        chosen = random.sample(name_pool, 3)
        # Substitute placeholders one after another, A first, then B, then C.
        for placeholder, person in zip("ABC", chosen):
            text = text.replace(placeholder, person)
        explanation = f"Following family relationships, the answer is {relation}"
        results.append(make_question(qtid, text, relation, distractors, explanation, 2))
    return results


def gen_direction(conn, count=1200):
    """Build "which way is the person facing" questions.

    A person starts facing a random compass direction and makes one to three
    random quarter turns (left or right); the answer is the final heading and
    the distractors are the other three directions.

    Args:
        conn: Database handle forwarded to ``get_qtid``.
        count: Number of questions to build.

    Returns:
        A list of ``make_question`` results; empty when ``get_qtid`` returns
        a falsy id for this question type.
    """
    qtid = get_qtid(conn, SUBJECT, "Logical Reasoning", "Direction and Distance", "Find final direction")
    if not qtid:
        return []

    directions = ["North", "South", "East", "West"]
    # Compass points in clockwise order: a right turn moves one slot forward,
    # a left turn one slot back.
    clockwise = ("North", "East", "South", "West")
    quarter_turn = {"right": 1, "left": -1}

    produced = []
    for _ in range(count):
        start = random.choice(directions)
        turn_count = random.randint(1, 3)
        heading = clockwise.index(start)
        narrative = [f"starts facing {start}"]
        for _step in range(turn_count):
            side = random.choice(["right", "left"])
            heading = (heading + quarter_turn[side]) % 4
            narrative.append(f"turns {side}")
        facing = clockwise[heading]
        others = [d for d in directions if d != facing]
        produced.append(make_question(
            qtid,
            f"A person {', '.join(narrative)}. Which direction is the person facing now?",
            facing,
            others[:3],
            f"After turns: {facing}",
            1,
        ))
    return produced


def _ordinal(n):
    """Return *n* with its English ordinal suffix, e.g. 1st, 2nd, 3rd, 11th, 22nd."""
    if 10 <= n % 100 <= 20:
        suffix = "th"
    else:
        suffix = {1: "st", 2: "nd", 3: "rd"}.get(n % 10, "th")
    return f"{n}{suffix}"


def gen_ranking(conn, count=1200):
    """Build rank-from-top / rank-from-bottom conversion questions.

    A row of 20-60 students is generated; the question gives a position
    counted from one end and asks for the position from the other end
    (``total - given + 1``).

    Args:
        conn: Database handle forwarded to ``get_qtid``.
        count: Number of questions to build.

    Returns:
        A list of ``make_question`` results; empty when ``get_qtid`` returns
        a falsy id for this question type.
    """
    questions = []
    qtid = get_qtid(conn, SUBJECT, "Logical Reasoning", "Order and Ranking", "Find rank from top/bottom")
    if not qtid:
        return questions
    for _ in range(count):
        total = random.randint(20, 60)
        from_top = random.randint(1, total)
        from_bottom = total - from_top + 1
        ask = random.choice(["top", "bottom"])
        if ask == "top":
            # _ordinal() avoids forms such as "1th" or "22th" in the text.
            questions.append(make_question(qtid,
                f"In a row of {total} students, a student is {_ordinal(from_bottom)} from the bottom. What is the student's position from the top?",
                str(from_top), nearby_wrong(from_top),
                f"From top = Total - From bottom + 1 = {total} - {from_bottom} + 1 = {from_top}", 1))
        else:
            questions.append(make_question(qtid,
                f"In a row of {total} students, a student is {_ordinal(from_top)} from the top. What is the student's position from the bottom?",
                str(from_bottom), nearby_wrong(from_bottom),
                f"From bottom = Total - From top + 1 = {total} - {from_top} + 1 = {from_bottom}", 1))
    return questions


# Matches a placeholder only when it stands alone as a word, so the "A" in
# "All" or the "C" inside an already substituted name is left untouched.
_SYLLOGISM_PLACEHOLDER = re.compile(r"\b[ABC]\b")


def _fill_terms(text, terms):
    """Replace the stand-alone placeholders A, B, C in *text* using *terms* in one pass."""
    return _SYLLOGISM_PLACEHOLDER.sub(lambda m: terms[m.group(0)], text)


def gen_syllogism(conn, count=1000):
    """Build syllogism questions from All/Some/No statement templates.

    A template is chosen at random and its placeholders A, B and C are filled
    with three distinct title-cased category names; the same substitution is
    applied to the correct conclusion and to each distractor.

    Args:
        conn: Database handle forwarded to ``get_qtid``.
        count: Number of questions to build.

    Returns:
        A list of ``make_question`` results; empty when ``get_qtid`` returns
        a falsy id for this question type.
    """
    questions = []
    qtid = get_qtid(conn, SUBJECT, "Logical Reasoning", "Syllogism", "All/Some/No conclusions")
    if not qtid:
        return questions

    # (premises, conclusion that follows, conclusions that do not)
    templates = [
        ("All A are B. All B are C.", "All A are C", ["No A is C", "Some A are not C", "All C are A"]),
        ("All A are B. Some B are C.", "Some A may be C", ["All A are C", "No A is C", "All C are A"]),
        ("No A is B. All B are C.", "Some C are not A", ["All A are C", "No C is A", "All C are A"]),
        ("Some A are B. All B are C.", "Some A are C", ["All A are C", "No A is C", "All C are A"]),
        ("All A are B. No B is C.", "No A is C", ["Some A are C", "All A are C", "All C are A"]),
    ]
    categories = ["dogs", "cats", "birds", "students", "teachers", "doctors", "players", "singers",
                   "dancers", "painters", "writers", "engineers", "lawyers", "flowers", "trees"]

    for _ in range(count):
        template, correct, wrongs = random.choice(templates)
        cats = random.sample(categories, 3)
        terms = {key: cat.title() for key, cat in zip("ABC", cats)}
        # Chained str.replace would turn "All" into e.g. "Dogsll" and rewrite
        # letters inside names inserted earlier; a single word-bounded pass
        # only touches the real placeholders.
        stmt = _fill_terms(template, terms)
        ans = _fill_terms(correct, terms)
        wrong_list = [_fill_terms(w, terms) for w in wrongs]
        questions.append(make_question(qtid,
            f"Statements: {stmt}\nConclusion: Which follows?",
            ans, wrong_list, "Based on Venn diagram logic", 2))
    return questions


# ============ NON-VERBAL REASONING ============

def gen_mirror_image(conn, count=800):
    """Build "mirror image of a number" questions.

    The mirror image is modelled as the digit string of the number reversed.
    Distractors are the number plus small distinct offsets.

    Args:
        conn: Database handle forwarded to ``get_qtid``.
        count: Number of questions to build.

    Returns:
        A list of ``make_question`` results; empty when ``get_qtid`` returns
        a falsy id for this question type.
    """
    questions = []
    qtid = get_qtid(conn, SUBJECT, "Non-Verbal Reasoning", "Mirror and Water Image", "Mirror image of text/numbers")
    if not qtid:
        return questions
    for _ in range(count):
        num = random.randint(100, 9999)
        mirror = str(num)[::-1]
        # Four distinct offsets give four distinct candidates; at most one can
        # coincide with the correct answer, so three valid distractors always
        # remain and none of them repeats.
        offsets = random.sample(range(1, 101), 4)
        candidates = [str(num + d) for d in offsets]
        wrongs = [c for c in candidates if c != mirror][:3]
        questions.append(make_question(qtid,
            f"What is the mirror image of the number {num} when a mirror is placed on the right side?",
            mirror, wrongs,
            f"Mirror reverses left-right: {num} → {mirror}", 1))
    return questions


def gen_dice(conn, count=800):
    """Build "opposite face of a standard die" questions.

    Uses the rule that opposite faces of a standard die sum to 7. The
    distractors are the first three faces that are neither the asked face
    nor its opposite.

    Args:
        conn: Database handle forwarded to ``get_qtid``.
        count: Number of questions to build.

    Returns:
        A list of ``make_question`` results; empty when ``get_qtid`` returns
        a falsy id for this question type.
    """
    questions = []
    qtid = get_qtid(conn, SUBJECT, "Non-Verbal Reasoning", "Dice and Cube", "Opposite face of dice")
    if not qtid:
        return questions
    for _ in range(count):
        # Standard dice: opposite faces sum to 7
        num = random.randint(1, 6)
        opp = 7 - num
        wrongs = [str(x) for x in range(1, 7) if x != num and x != opp][:3]
        questions.append(make_question(qtid,
            f"On a standard die, what number is opposite to {num}?",
            str(opp), wrongs,
            f"On a standard die, opposite faces sum to 7: {num} + {opp} = 7", 1))
    return questions


def gen_cube_painting(conn, count=600):
    """Build painted-cube counting questions.

    A cube of side ``n`` (2-6) is painted and cut into ``n**3`` unit cubes;
    the question asks how many unit cubes have three, two, one or no painted
    faces.

    Args:
        conn: Database handle forwarded to ``get_qtid``.
        count: Number of questions to build.

    Returns:
        A list of ``make_question`` results; empty when ``get_qtid`` returns
        a falsy id for this question type.
    """
    qtid = get_qtid(conn, SUBJECT, "Non-Verbal Reasoning", "Dice and Cube", "Painted cube counting")
    if not qtid:
        return []

    out = []
    for _ in range(count):
        n = random.randint(2, 6)
        total = n ** 3
        # Unit cubes per edge that are not at a corner; 0 when n == 2, which
        # makes every count except the corners vanish.
        inner = n - 2
        painted_counts = {
            "three": 8,             # corners
            "two": inner * 12,      # along the 12 edges
            "one": inner ** 2 * 6,  # centres of the 6 faces
            "no": inner ** 3,       # fully interior
        }
        ask = random.choice(["three", "two", "one", "no"])
        ans = painted_counts[ask]
        out.append(make_question(
            qtid,
            f"A cube of side {n} is painted on all faces and then cut into {total} unit cubes. How many cubes have {ask} face(s) painted?",
            str(ans),
            nearby_wrong(ans),
            f"For {n}×{n}×{n} cube: {ask} faces painted = {ans}",
            2,
        ))
    return out


# ============ MATHEMATICAL REASONING ============

def gen_math_operations(conn, count=1000):
    """Build symbol-substitution arithmetic questions.

    Two symbols are each assigned one of +, -, ×, ÷ and an expression
    ``a <sym1> b <sym2> c`` must be evaluated. The result follows normal
    operator precedence (× and ÷ before + and -, otherwise left to right),
    and operands are adjusted so that any division is exact.

    Args:
        conn: Database handle forwarded to ``get_qtid``.
        count: Number of questions to build.

    Returns:
        A list of ``make_question`` results; empty when ``get_qtid`` returns
        a falsy id for this question type.
    """
    questions = []
    qtid = get_qtid(conn, SUBJECT, "Mathematical Reasoning", "Mathematical Operations", "Symbol substitution")
    if not qtid:
        return questions
    ops = {'+': lambda a, b: a + b, '-': lambda a, b: a - b,
           '×': lambda a, b: a * b, '÷': lambda a, b: a // b}
    high_precedence = ('×', '÷')
    symbols = ['@', '#', '$', '&', '*', '!']
    for _ in range(count):
        first_op, second_op = random.sample(list(ops), 2)
        first_sym, second_sym = random.sample(symbols, 2)
        a, b, c = random.randint(2, 20), random.randint(2, 20), random.randint(2, 20)

        # The two operators are distinct, so at most one of these applies.
        # Making the dividend a multiple of the divisor keeps `//` equal to
        # true division in every evaluation order used below.
        if second_op == '÷':
            b = c * random.randint(1, 10)
        if first_op == '÷':
            a = b * random.randint(1, 10)

        if second_op in high_precedence and first_op not in high_precedence:
            # e.g. a + b × c  ->  a + (b × c)
            result = ops[first_op](a, ops[second_op](b, c))
        else:
            # Equal precedence, or the higher one comes first: left to right.
            result = ops[second_op](ops[first_op](a, b), c)

        mapping_text = f"{first_sym} means '{first_op}' and {second_sym} means '{second_op}'"
        expr_text = f"{a} {first_sym} {b} {second_sym} {c}"
        questions.append(make_question(qtid,
            f"If {mapping_text}, find: {expr_text}",
            str(result), nearby_wrong(result),
            f"Replace symbols: {a} {first_op} {b} {second_op} {c} = {result}", 2))
    return questions


def gen_number_puzzles(conn, count=800):
    """Build "missing number in a row" questions based on equal row sums.

    Row 1 has three random numbers; row 2 shows two numbers and the third is
    whatever makes both rows sum to the same value. The two shown numbers are
    drawn so the missing one is always at least 1, which means exactly
    ``count`` questions are produced.

    Args:
        conn: Database handle forwarded to ``get_qtid``.
        count: Number of questions to build.

    Returns:
        A list of ``make_question`` results; empty when ``get_qtid`` returns
        a falsy id for this question type.
    """
    questions = []
    qtid = get_qtid(conn, SUBJECT, "Mathematical Reasoning", "Number Puzzles", "Find missing number in grid")
    if not qtid:
        return questions
    for _ in range(count):
        r1 = [random.randint(1, 20) for _ in range(3)]
        target = sum(r1)  # at least 3
        # Upper bounds leave room for the remaining entries to be >= 1 while
        # keeping the original cap of 15 on the two shown numbers.
        r2_a = random.randint(1, min(15, target - 2))
        r2_b = random.randint(1, min(15, target - r2_a - 1))
        r2_c = target - r2_a - r2_b
        questions.append(make_question(qtid,
            f"In a grid, row 1 is [{r1[0]}, {r1[1]}, {r1[2]}] (sum={target}). Row 2 is [{r2_a}, {r2_b}, ?]. Find the missing number if row sums are equal.",
            str(r2_c), nearby_wrong(r2_c),
            f"? = {target} - {r2_a} - {r2_b} = {r2_c}", 1))
    return questions


def gen_venn_diagram(conn, count=800):
    """Build two-set Venn questions asking for an "only one set" count.

    Totals for tea and coffee drinkers and for people who like both are
    generated; the question asks how many like only tea or only coffee
    (chosen at random), i.e. that set's total minus the overlap.

    Args:
        conn: Database handle forwarded to ``get_qtid``.
        count: Number of questions to build.

    Returns:
        A list of ``make_question`` results; empty when ``get_qtid`` returns
        a falsy id for this question type.
    """
    questions = []
    qtid = get_qtid(conn, SUBJECT, "Mathematical Reasoning", "Venn Diagram", "Count elements in region")
    if not qtid:
        return questions
    for _ in range(count):
        total_a = random.randint(20, 100)
        total_b = random.randint(20, 100)
        # The overlap cannot exceed either set.
        both = random.randint(5, min(total_a, total_b))
        only_a = total_a - both
        only_b = total_b - both
        drink, liked, only = random.choice([
            ("tea", total_a, only_a),
            ("coffee", total_b, only_b),
        ])
        questions.append(make_question(qtid,
            f"In a group, {total_a} like tea, {total_b} like coffee, and {both} like both. How many like only {drink}?",
            str(only), nearby_wrong(only),
            f"Only {drink} = {liked} - {both} = {only}", 1))
    return questions


# ============ CRITICAL THINKING ============

def gen_statement_conclusion(conn, count=800):
    """Build statement-and-conclusion questions from a fixed template list.

    Each question repeats one randomly chosen template: a statement, the
    conclusion keyed as correct, and three distractor conclusions.

    Args:
        conn: Database handle forwarded to ``get_qtid``.
        count: Number of questions to build.

    Returns:
        A list of ``make_question`` results; empty when ``get_qtid`` returns
        a falsy id for this question type.
    """
    qtid = get_qtid(conn, SUBJECT, "Critical Thinking", "Statement and Conclusion", "Which conclusion follows")
    if not qtid:
        return []

    # (statement, conclusion that follows, conclusions that do not)
    bank = [
        ("All students who study hard pass the exam.", "Some who pass studied hard", ["No one studies hard", "Everyone fails", "Studying is not needed"]),
        ("Regular exercise improves health.", "People who exercise are healthier", ["Exercise is harmful", "Health has no relation to exercise", "Only medicine improves health"]),
        ("Reading improves vocabulary.", "People who read more have better vocabulary", ["Reading is useless", "Vocabulary cannot be improved", "TV improves vocabulary more"]),
        ("Smoking causes cancer.", "Smokers are at higher risk of cancer", ["All smokers get cancer", "Cancer has no cause", "Smoking is healthy"]),
        ("Water pollution affects marine life.", "Marine life is harmed by water pollution", ["Marine life thrives in pollution", "Pollution has no effect", "Only air pollution matters"]),
    ]

    generated = []
    remaining = count
    while remaining > 0:
        statement, follows, does_not_follow = random.choice(bank)
        prompt = f"Statement: {statement}\nWhich conclusion logically follows?"
        generated.append(
            make_question(qtid, prompt, follows, does_not_follow, "Direct logical inference", 2)
        )
        remaining -= 1
    return generated


def gen_letter_series(conn, count=1000):
    """Build "find the next letter" questions with a constant skip.

    Four letters separated by a fixed skip (1-4) are shown and the fifth is
    the answer. The start is bounded so all five letters stay within A-Z,
    which means exactly ``count`` questions are produced.

    Args:
        conn: Database handle forwarded to ``get_qtid``.
        count: Number of questions to build.

    Returns:
        A list of ``make_question`` results; empty when ``get_qtid`` returns
        a falsy id for this question type.
    """
    questions = []
    qtid = get_qtid(conn, SUBJECT, "Verbal Reasoning", "Letter Series", "Find next letters")
    if not qtid:
        return questions
    for _ in range(count):
        skip = random.randint(1, 4)
        # Keep the fifth letter (index start + 4*skip) at or before 'Z'
        # without exceeding the original upper start of 15.
        start = random.randint(0, min(15, 25 - 4 * skip))
        series = [chr(65 + start + i * skip) for i in range(4)]
        nxt_idx = start + 4 * skip
        ans = chr(65 + nxt_idx)
        # Distractors are the letters one/two after and one before the answer,
        # wrapping around the alphabet.
        wrongs = [chr(65 + (nxt_idx + i) % 26) for i in [1, 2, -1]]
        questions.append(make_question(qtid,
            f"Find the next letter: {', '.join(series)}, ?",
            ans, wrongs,
            f"Skip {skip}: next = {ans}", 1))
    return questions


def generate_all(conn):
    """Run every reasoning generator, store the results and report counts.

    Each generator is called with ``conn`` and its own target count; a line
    with the number of questions it returned is printed. The combined list is
    then passed to ``insert_questions_batch`` in slices of 5000.

    Args:
        conn: Database handle forwarded to the generators and to
            ``insert_questions_batch``.

    Returns:
        The total number of questions produced by all generators.
    """
    # (label for the progress line, generator, requested count)
    plan = (
        ("Number Analogy", gen_number_analogy, 700),
        ("Letter Analogy", gen_letter_analogy, 500),
        ("Classification", gen_classification, 2000),
        ("Number Series", gen_number_series, 2000),
        ("Letter Series", gen_letter_series, 1500),
        ("Coding-Decoding", gen_coding_decoding, 2500),
        ("Blood Relations", gen_blood_relations, 2000),
        ("Direction & Distance", gen_direction, 1500),
        ("Order & Ranking", gen_ranking, 1500),
        ("Syllogism", gen_syllogism, 1500),
        ("Mirror Image", gen_mirror_image, 1000),
        ("Dice", gen_dice, 1000),
        ("Cube Painting", gen_cube_painting, 800),
        ("Math Operations", gen_math_operations, 1500),
        ("Number Puzzles", gen_number_puzzles, 1200),
        ("Venn Diagram", gen_venn_diagram, 1200),
        ("Statement & Conclusion", gen_statement_conclusion, 1200),
    )

    collected = []
    grand_total = 0
    for label, builder, requested in plan:
        produced = builder(conn, requested)
        collected.extend(produced)
        print(f"  {label}: {len(produced)} questions")
        grand_total += len(produced)

    # Hand the questions over in fixed-size slices.
    chunk = 5000
    offset = 0
    while offset < len(collected):
        insert_questions_batch(conn, collected[offset:offset + chunk])
        offset += chunk

    print(f"  TOTAL Reasoning: {grand_total}")
    return grand_total


# Script entry point: open a connection, generate and store all questions.
if __name__ == '__main__':
    conn = get_db()
    try:
        print("Generating Reasoning questions...")
        generate_all(conn)
    finally:
        # Close the connection even if generation fails part-way through.
        conn.close()