°斗破£笙 2020-03-03 22:31 采纳率: 0%
浏览 577

求解c++爬虫,这个代码为何老是(hash_set)错误?

老是编译错误好难呀,越改越错只好让大佬们帮忙看看了!用的是vs2019

报错

    E0035   #error 指令:  <hash_set> is deprecated and will be REMOVED. Please use <unordered_set>. You can define _SILENCE_STDEXT_HASH_DEPRECATION_WARNINGS to acknowledge that you have received this warning.  c爬虫 C:\Program Files (x86)\Microsoft Visual Studio\2019\Professional\VC\Tools\MSVC\14.24.28314\include\hash_set 21  
错误  C1189   #error:  <hash_set> is deprecated and will be REMOVED. Please use <unordered_set>. You can define _SILENCE_STDEXT_HASH_DEPRECATION_WARNINGS to acknowledge that you have received this warning. c爬虫 C:\Program Files (x86)\Microsoft Visual Studio\2019\Professional\VC\Tools\MSVC\14.24.28314\include\hash_set 22  

设计爬虫程序并将爬取到内容存放在文件中。
要求:
1. 可将指定URL及其级联的三级网页下面的所有URL及相应内容找出来。
2. 以BFS或者DFS策略搜索级联页面,设计相应的数据结构与文件,将URL与页面内容(文本形式)保存。
3. 提供简单的搜索引擎功能(给定字符串,如果在级联的三级网页下面下存在,则返回相应的URL)



#include <string>
#include <iostream>
#include <fstream>
#include <vector>
#include "winsock2.h"
#include <time.h>
#include <queue>
#include <hash_set>
#pragma comment(lib, "ws2_32.lib") 
using namespace std;
#define DEFAULT_PAGE_BUF_SIZE 1048576

queue<string> hrefUrl;
hash_set<string> visitedUrl;
hash_set<string> visitedImg;
int g_ImgCnt = 1;//图片编号

bool ParseURL(const string& url, string& host, string& resource) {//解析URL,解析出主机名,资源名
    if (strlen(url.c_str()) > 2000) {
        return false;
    }
    const char* pos = strstr(url.c_str(), "http://");
    if (pos == NULL) pos = url.c_str();
    else pos += strlen("http://");
    char pHost[100];
    char pResource[2000];
    if (strstr(pos, "/") == 0)
    {
        pResource[0] = '/'; pResource[1] = 0;
        strcpy(pHost, pos);
    }
    else
        sscanf(pos, "%[^/]%s", pHost, pResource);

    host = pHost;
    resource = pResource;
    return true;
}
//使用Get请求,得到响应
bool GetHttpResponse(const string& url, char*& response, int& bytesRead) {
    string host, resource;
    if (!ParseURL(url, host, resource)) {
        cout << "获取get请求失败" << endl;
        return false;
    }

    //建立socket
    struct hostent* hp = gethostbyname(host.c_str());
    if (hp == NULL) {
        cout << " host 地址错误" << endl;
        return false;
    }

    SOCKET sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
    if (sock == -1 || sock == -2) {
        cout << " sock创建失败" << endl;
        return false;
    }
    //建立服务器地址
    SOCKADDR_IN sa;
    sa.sin_family = AF_INET;
    sa.sin_port = htons(80);
    memcpy(&sa.sin_addr, hp->h_addr, 4);
    //建立连接
    if (0 != connect(sock, (SOCKADDR*)&sa, sizeof(sa))) {
        cout << "不能连接: " << url << endl;
        closesocket(sock);
        return false;
    };
    //准备发送数据
    string request = "GET " + resource + " HTTP/1.1\r\nHost:" + host + "\r\nConnection:Close\r\n\r\n";//字符串拼接
    //发送数据
    if (SOCKET_ERROR == send(sock, request.c_str(), request.size(), 0)) {//request.size()=strlen(request.c_str)
        cout << "send error" << endl;
        closesocket(sock);
        return false;
    }
    //接收数据
    int m_nContentLength = DEFAULT_PAGE_BUF_SIZE;
    char* pageBuf = (char*)malloc(m_nContentLength);
    memset(pageBuf, 0, m_nContentLength);
    bytesRead = 0;
    int ret = 1;
    while (ret > 0)
    {
        ret = recv(sock, pageBuf + bytesRead, m_nContentLength - bytesRead, 0);
        if (ret > 0){
            bytesRead += ret;
        }
        if (m_nContentLength - bytesRead < 100){
            m_nContentLength *= 2;
            pageBuf = (char*)realloc(pageBuf, m_nContentLength);
        }
        cout << ret << " ";
    }
    cout << endl;
    pageBuf[bytesRead] = '\0';
    response = pageBuf;
    closesocket(sock);
    return true;
}
//提取所有的URL以及图片URL
void HTMLParse(string& htmlResponse, vector<string>& imgurls, const string& host) {
    const char* p = htmlResponse.c_str();
    const char* tag = "href=\"";//href的内容 就是指要跳转的路由 或 方法 
    const char* pos = strstr(p, tag);
    ofstream ofile("url.txt", ios::app);//以追加的方式打开文件
    while (pos){
        pos += strlen(tag);
        const char* nextQ = strstr(pos, "\"");
        if (nextQ){
            char* url = new char[nextQ - pos + 1];
            sscanf(pos, "%[^\"]", url);
            string surl = url;  // 转换成string类型,可以自动释放内存
            if (visitedUrl.find(surl) == visitedUrl.end()) {//   visitedUrl.find( surl )=visitedUrl.end() 表示visitedUrl中没有sur1
                visitedUrl.insert(surl);
                ofile << surl << endl;
                hrefUrl.push(surl);//将sur1插到队列末
            }
            pos = strstr(pos, tag);
            delete[] url; 
        }
    }
    ofile << endl << endl;
    ofile.close();
    tag = "<img ";
    const char* att1 = "src=\"";//规定图像的 URL。
    const char* att2 = "lazy_src=\"";
    const char* pos0 = strstr(p, tag);
    while (pos0){
        pos0 += strlen(tag);
        const char* pos2 = strstr(pos0, att2);
        if (!pos2 || pos2 > strstr(pos0, ">")){
            pos = strstr(pos0, att1);//非延迟加载项
            if (!pos)
            {
                pos0 = strstr(att1, tag);
                continue;
            }
            else
            {
                pos = pos + strlen(att1);
            }
        }else {
            pos = pos2 + strlen(att2);
        }

        const char* nextQ = strstr(pos, "\"");
        if (nextQ){
            char* url = new char[nextQ - pos + 1];
            sscanf(pos, "%[^\"]", url);
            cout << url << endl;
            string imgUrl = url;
            if (visitedImg.find(imgUrl) == visitedImg.end())
            {
                visitedImg.insert(imgUrl);
                imgurls.push_back(imgUrl);//imgurls作为模板队列,push_back的参数由单个字符变成string类型
            }
            pos0 = strstr(pos0, tag);
            delete[] url;
        }
    }
    cout << "网页请求成功!" << endl;
}

//把URL转化为文件名
string ToFileName(const string& url) {
    string fileName;
    fileName.resize(url.size());
    int k = 0;
    for (int i = 0; i < (int)url.size(); i++) {
        char ch = url[i];
        if (ch != '\\' && ch != '/' && ch != ':' && ch != '*' && ch != '?' && ch != '"' && ch != '<' && ch != '>' && ch != '|')
            fileName[k++] = ch;
    }
    return fileName.substr(0, k) + ".txt";//从第0位开始,长度为k的字符串 + .txt
}

//下载图片到img文件夹
void DownLoadImg(vector<string>& imgurls, const string& url) {

    //生成保存该url下图片的文件夹
    string foldname = ToFileName(url);
    foldname = "./img/" + foldname;
    if (!CreateDirectoryA(foldname.c_str(), NULL))
        cout << "不能创建文件:" << foldname << endl;
    char* image;
    int byteRead;
    for (int i = 0; i < imgurls.size(); i++)
    {
        //判断是否为图片
        string str = imgurls[i];
        int pos = str.find_last_of(".");
        if (pos == string::npos)
            continue;
        else
        {
            string ext = str.substr(pos + 1, str.size() - pos - 1);
            if (ext != "bmp" && ext != "jpg" && ext != "jpeg" && ext != "gif" && ext != "png")
                continue;
        }
        //下载其中的内容
        if (GetHttpResponse(imgurls[i], image, byteRead))
        {
            if (strlen(image) == 0)
            {
                continue;
            }
            const char* p = image;
            const char* pos = strstr(p, "\r\n\r\n") + strlen("\r\n\r\n");
            int index = imgurls[i].find_last_of("/");
            if (index != string::npos)
            {
                string imgname = imgurls[i].substr(index, imgurls[i].size());
                ofstream ofile(foldname + imgname, ios::binary);
                if (!ofile.is_open())
                    continue;
                cout << g_ImgCnt++ << foldname + imgname << endl;
                ofile.write(pos, byteRead - (pos - p));
                ofile.close();
            }
            free(image);
        }
    }
}

//广度遍历
void BFS(const string& url) {
    char* response;
    int bytes;
    // 获取网页的相应,放入response中。
    if (!GetHttpResponse(url, response, bytes)) {
        cout << "网页获取错误!." << endl;
        return;
    }
    string httpResponse = response;
    free(response);
    string filename = ToFileName(url);
    ofstream ofile("./html/" + filename);
    if (ofile.is_open()) {
        ofile << httpResponse << endl; // 保存该网页的文本内容
        ofile.close();
    }
    vector<string> imgurls;
    HTMLParse(httpResponse, imgurls, url);//解析该网页的所有图片链接,放入imgurls里面
    DownLoadImg(imgurls, url);
}
//文件内容搜索 
void Fstring(string& urlar) {
    int i = 0;
    cout << "请输入需要查找的字符串";
    string defo, urltxt;
    cin >> defo;
    ifstream ifile;
    do {
        if (urlar[i] == NULL) {
            cout << "没有找到所要内容";
            ifile.close();
            return;
        }
        ifstream ifile("./html/" + urlar[i++]);
        ifile >> urltxt;
    } while (urltxt.find(defo) == string::npos);//字符串对比 
    cout << "搜索内容位于:" << urlar[i - 1];
    ifile.close();
}

int main(){
    //初始化socket,用于tcp网络连接
    WSADATA wsaData;
    if (WSAStartup(MAKEWORD(2, 2), &wsaData) != 0) {
        return 0;
    }
    // 创建文件夹,保存图片和网页文本文件
    CreateDirectoryA("./img", 0);
    CreateDirectoryA("./html", 0);
    string urlStart = "http://hao.360.cn/meinvdaohang.html";
    //cout << "请输入url";
    //cin >> urlStart;
    BFS(urlStart);
    string urlar[2000];
    int i = 0;
    // 访问过的网址保存起来
    visitedUrl.insert(urlStart);
    while (hrefUrl.size() != 0) {
        string url = hrefUrl.front();  // 从队列的最开始取出一个网址
        cout << url << endl;
        BFS(url);                      // 遍历提取出来的那个网页,找它里面的超链接网页放入hrefUrl,下载它里面的文本,图片
        urlar[i++] = ToFileName(url);
        hrefUrl.pop();                 // 遍历完之后,删除这个网址
    }
    int p;
    cout << "是否需要查找内容?(1,查找,2退出):";
    cin >> p;
    if (p == 1)
        Fstring(*urlar);
    WSACleanup();//解除socket

    return 0;
}
#include <string>
#include <iostream>
#include <fstream>
#include <vector>
#include "winsock2.h"
#include <time.h>
#include <queue>
#include <hash_set>
#pragma comment(lib, "ws2_32.lib") 
using namespace std;
#define DEFAULT_PAGE_BUF_SIZE 1048576

queue<string> hrefUrl;
hash_set<string> visitedUrl;
hash_set<string> visitedImg;
int g_ImgCnt = 1;//图片编号

bool ParseURL(const string& url, string& host, string& resource) {//解析URL,解析出主机名,资源名
    if (strlen(url.c_str()) > 2000) {
        return false;
    }
    const char* pos = strstr(url.c_str(), "http://");
    if (pos == NULL) pos = url.c_str();
    else pos += strlen("http://");
    char pHost[100];
    char pResource[2000];
    if (strstr(pos, "/") == 0)
    {
        pResource[0] = '/'; pResource[1] = 0;
        strcpy(pHost, pos);
    }
    else
        sscanf(pos, "%[^/]%s", pHost, pResource);

    host = pHost;
    resource = pResource;
    return true;
}
//使用Get请求,得到响应
bool GetHttpResponse(const string& url, char*& response, int& bytesRead) {
    string host, resource;
    if (!ParseURL(url, host, resource)) {
        cout << "获取get请求失败" << endl;
        return false;
    }

    //建立socket
    struct hostent* hp = gethostbyname(host.c_str());
    if (hp == NULL) {
        cout << " host 地址错误" << endl;
        return false;
    }

    SOCKET sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
    if (sock == -1 || sock == -2) {
        cout << " sock创建失败" << endl;
        return false;
    }
    //建立服务器地址
    SOCKADDR_IN sa;
    sa.sin_family = AF_INET;
    sa.sin_port = htons(80);
    memcpy(&sa.sin_addr, hp->h_addr, 4);
    //建立连接
    if (0 != connect(sock, (SOCKADDR*)&sa, sizeof(sa))) {
        cout << "不能连接: " << url << endl;
        closesocket(sock);
        return false;
    };
    //准备发送数据
    string request = "GET " + resource + " HTTP/1.1\r\nHost:" + host + "\r\nConnection:Close\r\n\r\n";//字符串拼接
    //发送数据
    if (SOCKET_ERROR == send(sock, request.c_str(), request.size(), 0)) {//request.size()=strlen(request.c_str)
        cout << "send error" << endl;
        closesocket(sock);
        return false;
    }
    //接收数据
    int m_nContentLength = DEFAULT_PAGE_BUF_SIZE;
    char* pageBuf = (char*)malloc(m_nContentLength);
    memset(pageBuf, 0, m_nContentLength);
    bytesRead = 0;
    int ret = 1;
    while (ret > 0)
    {
        ret = recv(sock, pageBuf + bytesRead, m_nContentLength - bytesRead, 0);
        if (ret > 0){
            bytesRead += ret;
        }
        if (m_nContentLength - bytesRead < 100){
            m_nContentLength *= 2;
            pageBuf = (char*)realloc(pageBuf, m_nContentLength);
        }
        cout << ret << " ";
    }
    cout << endl;
    pageBuf[bytesRead] = '\0';
    response = pageBuf;
    closesocket(sock);
    return true;
}
//提取所有的URL以及图片URL
void HTMLParse(string& htmlResponse, vector<string>& imgurls, const string& host) {
    const char* p = htmlResponse.c_str();
    const char* tag = "href=\"";//href的内容 就是指要跳转的路由 或 方法 
    const char* pos = strstr(p, tag);
    ofstream ofile("url.txt", ios::app);//以追加的方式打开文件
    while (pos){
        pos += strlen(tag);
        const char* nextQ = strstr(pos, "\"");
        if (nextQ){
            char* url = new char[nextQ - pos + 1];
            sscanf(pos, "%[^\"]", url);
            string surl = url;  // 转换成string类型,可以自动释放内存
            if (visitedUrl.find(surl) == visitedUrl.end()) {//   visitedUrl.find( surl )=visitedUrl.end() 表示visitedUrl中没有sur1
                visitedUrl.insert(surl);
                ofile << surl << endl;
                hrefUrl.push(surl);//将sur1插到队列末
            }
            pos = strstr(pos, tag);
            delete[] url; 
        }
    }
    ofile << endl << endl;
    ofile.close();
    tag = "<img ";
    const char* att1 = "src=\"";//规定图像的 URL。
    const char* att2 = "lazy_src=\"";
    const char* pos0 = strstr(p, tag);
    while (pos0){
        pos0 += strlen(tag);
        const char* pos2 = strstr(pos0, att2);
        if (!pos2 || pos2 > strstr(pos0, ">")){
            pos = strstr(pos0, att1);//非延迟加载项
            if (!pos)
            {
                pos0 = strstr(att1, tag);
                continue;
            }
            else
            {
                pos = pos + strlen(att1);
            }
        }else {
            pos = pos2 + strlen(att2);
        }

        const char* nextQ = strstr(pos, "\"");
        if (nextQ){
            char* url = new char[nextQ - pos + 1];
            sscanf(pos, "%[^\"]", url);
            cout << url << endl;
            string imgUrl = url;
            if (visitedImg.find(imgUrl) == visitedImg.end())
            {
                visitedImg.insert(imgUrl);
                imgurls.push_back(imgUrl);//imgurls作为模板队列,push_back的参数由单个字符变成string类型
            }
            pos0 = strstr(pos0, tag);
            delete[] url;
        }
    }
    cout << "网页请求成功!" << endl;
}

//把URL转化为文件名
string ToFileName(const string& url) {
    string fileName;
    fileName.resize(url.size());
    int k = 0;
    for (int i = 0; i < (int)url.size(); i++) {
        char ch = url[i];
        if (ch != '\\' && ch != '/' && ch != ':' && ch != '*' && ch != '?' && ch != '"' && ch != '<' && ch != '>' && ch != '|')
            fileName[k++] = ch;
    }
    return fileName.substr(0, k) + ".txt";//从第0位开始,长度为k的字符串 + .txt
}

//下载图片到img文件夹
void DownLoadImg(vector<string>& imgurls, const string& url) {

    //生成保存该url下图片的文件夹
    string foldname = ToFileName(url);
    foldname = "./img/" + foldname;
    if (!CreateDirectoryA(foldname.c_str(), NULL))
        cout << "不能创建文件:" << foldname << endl;
    char* image;
    int byteRead;
    for (int i = 0; i < imgurls.size(); i++)
    {
        //判断是否为图片
        string str = imgurls[i];
        int pos = str.find_last_of(".");
        if (pos == string::npos)
            continue;
        else
        {
            string ext = str.substr(pos + 1, str.size() - pos - 1);
            if (ext != "bmp" && ext != "jpg" && ext != "jpeg" && ext != "gif" && ext != "png")
                continue;
        }
        //下载其中的内容
        if (GetHttpResponse(imgurls[i], image, byteRead))
        {
            if (strlen(image) == 0)
            {
                continue;
            }
            const char* p = image;
            const char* pos = strstr(p, "\r\n\r\n") + strlen("\r\n\r\n");
            int index = imgurls[i].find_last_of("/");
            if (index != string::npos)
            {
                string imgname = imgurls[i].substr(index, imgurls[i].size());
                ofstream ofile(foldname + imgname, ios::binary);
                if (!ofile.is_open())
                    continue;
                cout << g_ImgCnt++ << foldname + imgname << endl;
                ofile.write(pos, byteRead - (pos - p));
                ofile.close();
            }
            free(image);
        }
    }
}

//广度遍历
void BFS(const string& url) {
    char* response;
    int bytes;
    // 获取网页的相应,放入response中。
    if (!GetHttpResponse(url, response, bytes)) {
        cout << "网页获取错误!." << endl;
        return;
    }
    string httpResponse = response;
    free(response);
    string filename = ToFileName(url);
    ofstream ofile("./html/" + filename);
    if (ofile.is_open()) {
        ofile << httpResponse << endl; // 保存该网页的文本内容
        ofile.close();
    }
    vector<string> imgurls;
    HTMLParse(httpResponse, imgurls, url);//解析该网页的所有图片链接,放入imgurls里面
    DownLoadImg(imgurls, url);
}
//文件内容搜索 
void Fstring(string& urlar) {
    int i = 0;
    cout << "请输入需要查找的字符串";
    string defo, urltxt;
    cin >> defo;
    ifstream ifile;
    do {
        if (urlar[i] == NULL) {
            cout << "没有找到所要内容";
            ifile.close();
            return;
        }
        ifstream ifile("./html/" + urlar[i++]);
        ifile >> urltxt;
    } while (urltxt.find(defo) == string::npos);//字符串对比 
    cout << "搜索内容位于:" << urlar[i - 1];
    ifile.close();
}

int main(){
    //初始化socket,用于tcp网络连接
    WSADATA wsaData;
    if (WSAStartup(MAKEWORD(2, 2), &wsaData) != 0) {
        return 0;
    }
    // 创建文件夹,保存图片和网页文本文件
    CreateDirectoryA("./img", 0);
    CreateDirectoryA("./html", 0);
    string urlStart = "http://hao.360.cn/meinvdaohang.html";
    //cout << "请输入url";
    //cin >> urlStart;
    BFS(urlStart);
    string urlar[2000];
    int i = 0;
    // 访问过的网址保存起来
    visitedUrl.insert(urlStart);
    while (hrefUrl.size() != 0) {
        string url = hrefUrl.front();  // 从队列的最开始取出一个网址
        cout << url << endl;
        BFS(url);                      // 遍历提取出来的那个网页,找它里面的超链接网页放入hrefUrl,下载它里面的文本,图片
        urlar[i++] = ToFileName(url);
        hrefUrl.pop();                 // 遍历完之后,删除这个网址
    }
    int p;
    cout << "是否需要查找内容?(1,查找,2退出):";
    cin >> p;
    if (p == 1)
        Fstring(*urlar);
    WSACleanup();//解除socket

    return 0;
}
  • 写回答

1条回答

  • CSDN-Ada助手 CSDN-AI 官方账号 2022-09-20 21:20
    关注
    不知道你这个问题是否已经解决, 如果还没有解决的话:

    如果你已经解决了该问题, 非常希望你能够分享一下解决方案, 写成博客, 将相关链接放在评论区, 以帮助更多的人 ^-^
    评论

报告相同问题?

悬赏问题

  • ¥60 版本过低apk如何修改可以兼容新的安卓系统
  • ¥25 由IPR导致的DRIVER_POWER_STATE_FAILURE蓝屏
  • ¥50 有数据,怎么建立模型求影响全要素生产率的因素
  • ¥50 有数据,怎么用matlab求全要素生产率
  • ¥15 TI的insta-spin例程
  • ¥15 完成下列问题完成下列问题
  • ¥15 C#算法问题, 不知道怎么处理这个数据的转换
  • ¥15 YoloV5 第三方库的版本对照问题
  • ¥15 请完成下列相关问题!
  • ¥15 drone 推送镜像时候 purge: true 推送完毕后没有删除对应的镜像,手动拷贝到服务器执行结果正确在样才能让指令自动执行成功删除对应镜像,如何解决?