ABOUT ME

-

Today
-
Yesterday
-
Total
-
  • 뚝딱 뚝딱
    취미 생활 ㅎ 2025. 2. 24. 19:51

    간만에 동네 친구가 뭔갈 부탁


    오 재밌겠다 싶어 퇴근하고 바로 작업 시작

    import time
    import pandas as pd
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.chrome.service import Service
    from selenium.webdriver.chrome.options import Options
    from webdriver_manager.chrome import ChromeDriverManager

    def parse_naver_blog(url):
        """Scrape ranked company rows that link to JobKorea from a Naver blog post.

        Loads ``url`` in headless Chrome, walks every ``table.se-table-content``
        table and keeps the rows whose second cell holds a link containing
        ``jobkorea.co.kr/company``.

        Args:
            url: Address of the blog post to load.

        Returns:
            A list of dicts with the keys ``rank``, ``blog_company_name`` and
            ``jobkorea_link``, in page order. Errors raised while loading or
            parsing the page are printed, and whatever was collected before
            the error is returned.
        """
        options = Options()
        for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
            options.add_argument(flag)
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        data_list = []
        try:
            # Navigation lives inside the try so that a failed page load still
            # reaches driver.quit() instead of leaking the browser process.
            driver.get(url)
            time.sleep(3)  # fixed pause after navigation, presumably to let the page render
            tables = driver.find_elements(By.CSS_SELECTOR, "table.se-table-content")
            for tbl in tables:
                for row in tbl.find_elements(By.CSS_SELECTOR, "tr.se-tr"):
                    tds = row.find_elements(By.TAG_NAME, "td")
                    if len(tds) < 2:
                        continue
                    rank = tds[0].text.strip()
                    second_td = tds[1]
                    blog_company_name = second_td.text.strip()
                    jobkorea_link = ""
                    try:
                        a_tag = second_td.find_element(By.TAG_NAME, "a")
                        blog_company_name = a_tag.text.strip()
                        # get_attribute() yields None for an anchor without href;
                        # normalise to "" so the substring check below cannot raise.
                        jobkorea_link = a_tag.get_attribute("href") or ""
                    except Exception:
                        # Best effort: a cell without a usable link keeps the
                        # empty link and is dropped by the filter below.
                        pass
                    if "jobkorea.co.kr/company" not in jobkorea_link:
                        continue
                    data_list.append({
                        "rank": rank,
                        "blog_company_name": blog_company_name,
                        "jobkorea_link": jobkorea_link,
                    })
        except Exception as e:
            print("네이버 블로그 파싱 에러:", e)
        finally:
            driver.quit()
        return data_list

    def parse_jobkorea_company(url):
        """Read company name, homepage and address from a JobKorea company page.

        Loads ``url`` in headless Chrome, takes the company name from the part
        of the page title before the first ``:``, and reads the "홈페이지" and
        "주소…" rows of the basic-information table.

        Args:
            url: Address of the company page to load.

        Returns:
            A dict with the keys ``jobkorea_company_name``, ``homepage`` and
            ``address``. Fields that could not be read keep their placeholder
            values. Errors raised while loading or parsing are printed rather
            than propagated.
        """
        options = Options()
        for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
            options.add_argument(flag)
        driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
        data = {
            "jobkorea_company_name": "회사이름읎음",
            "homepage": "폐업?",
            "address": "정보 읎음"
        }
        try:
            # Navigation lives inside the try so that a failed page load still
            # reaches driver.quit() instead of leaking the browser process.
            driver.get(url)
            time.sleep(3)  # fixed pause after navigation, presumably to let the page render
            company_name = driver.title.split(":")[0].strip()
            if company_name:
                # Keep the placeholder when the title yields nothing usable.
                data["jobkorea_company_name"] = company_name
            # NOTE(review): the selector spelling ("infomation") is kept verbatim;
            # presumably it mirrors the site's own class name.
            table = driver.find_element(By.CSS_SELECTOR, ".table-basic-infomation-primary")
            for row in table.find_elements(By.CSS_SELECTOR, "tr.field"):
                label_elems = row.find_elements(By.CSS_SELECTOR, "th.field-label")
                value_elems = row.find_elements(By.CSS_SELECTOR, "td.field-value")
                # zip() pairs labels with values and stops at the shorter list,
                # so a row with a missing value cell no longer aborts the parse.
                for label_elem, value_elem in zip(label_elems, value_elems):
                    label_text = label_elem.text.strip()
                    if label_text == "홈페이지":
                        homepage_text = value_elem.text.strip()
                        try:
                            href = value_elem.find_element(By.TAG_NAME, "a").get_attribute("href")
                        except Exception:
                            # No usable link in the cell: fall back to its text.
                            href = None
                        # Prefer the link target; fall back to the visible text
                        # when the anchor is missing or has no href.
                        data["homepage"] = href or homepage_text
                    elif label_text.startswith("주소"):
                        data["address"] = value_elem.text.strip()
        except Exception as e:
            print("잡코 파싱 오류:", e)
        finally:
            driver.quit()
        return data

    def collect_and_save_data(blog_urls, output_file="blog_jobkorea_result.xlsx"):
        """Scrape every blog post, enrich each company via JobKorea, write Excel.

        For each URL in ``blog_urls`` the rows returned by ``parse_naver_blog``
        are combined with the result of ``parse_jobkorea_company`` for the
        row's link. All combined rows are written to ``output_file`` as one
        sheet without the index column. When nothing was collected, a message
        is printed and no file is written.

        Args:
            blog_urls: Iterable of blog post URLs to process.
            output_file: Path of the Excel file to write.
        """
        blog_fields = ("rank", "blog_company_name", "jobkorea_link")
        company_fields = ("jobkorea_company_name", "homepage", "address")
        records = []
        for blog_url in blog_urls:
            print(f"[블로그파밍] {blog_url}")
            for entry in parse_naver_blog(blog_url):
                rank, b_name, link = (entry[field] for field in blog_fields)
                print(f"  - 링크 파싱: {rank}, {b_name}, {link}")
                company = parse_jobkorea_company(link)
                record = {
                    "blog_url": blog_url,
                    "rank": rank,
                    "blog_company_name": b_name,
                    "jobkorea_link": link,
                }
                for field in company_fields:
                    record[field] = company[field]
                records.append(record)

        # Nothing scraped: report it and skip writing a file.
        if not records:
            print("데이터 ㄴㄴㄴ.")
            return

        pd.DataFrame(records).to_excel(output_file, index=False)
        print(f"[완료] 엑셀 저장: {output_file}, 총 {len(records)}건")

    if __name__ == "__main__":
        # Blog posts to scrape when run as a script; append other blog URLs
        # to this list as needed.
        blog_urls = ["https://m.blog.naver.com/PostView.naver?blogId=esuccess&logNo=223193200932"]
        collect_and_save_data(blog_urls, output_file="1.xlsx")

    잘됨

    아 물론 손 좀 봐야 하는데 귀찮

    챗지피티 만세임

    '취미 생활 ㅎ' 카테고리의 다른 글

    대전 차 vs 함양 별  (0) 2026.05.24
    전천카메라 테스트  (0) 2026.03.08
    동네 애기 선물  (0) 2024.11.29
    만들어보자  (0) 2024.10.07
    지름신  (0) 2024.07.06