一、博客链接和Github链接

郑孔宇 俞凯欣
github项目地址

二、具体分工:

基本功能部分:郑孔宇
爬虫工具、测试及附加题部分:俞凯欣

三、PSP表格:

PSP2.1Personal Software Process Stages预估耗时(分钟)实际耗时(分钟)
Planning计划1010
· Estimate· 估计这个任务需要多少时间540620
Development开发00
· Analysis· 需求分析 (包括学习新技术)12060
· Design Spec· 生成设计文档2020
· Design Review· 设计复审2020
· Coding Standard· 代码规范 (为目前的开发制定合适的规范)00
· Design· 具体设计2020
· Coding· 具体编码220360
· Code Review· 代码复审4040
· Test· 测试(自我测试,修改代码,提交修改)6060
Reporting报告00
· Test Report· 测试报告00
· Size Measurement· 计算工作量00
· Postmortem & Process Improvement Plan· 事后总结, 并提出过程改进计划3030

四、爬虫工具

爬虫工具使用的是八爪鱼:通过点击需要爬取的信息来选定同类型的信息,然后制作流程图完成爬取,爬取完毕后再执行导出操作即可。

1474284-20181102131458958-438823308.png
1474284-20181102131449241-1302980055.png

五、需求分析

1474284-20181010152330310-290836455.png

六、功能实现

1.主函数
  • 获取命令行中的所有指令,并执行相关函数
/*
 * Entry point: reads the command-line options and runs the matching
 * counting pipeline.
 *
 *   -i <path>   input file, handed to readtxt / readtxt2
 *   -o <path>   output file, handed to writetxt_n / writetxt_m
 *   -w <int>    weight argument forwarded to divide_n / divide_m
 *   -n <int>    entry count forwarded to writetxt_n
 *   -m <int>    phrase length; a non-zero value selects the phrase pipeline
 *
 * When an option is repeated, the last occurrence wins.
 * Returns 0 on success and 1 when -i or -o is missing.
 */
int main(int args, char* argv[])
{
    char* a = NULL;  // input path  (-i)
    char* b = NULL;  // output path (-o)
    int w = 0, m = 0, n = 0;

    // Start at 1 because argv[0] is the program name, and stop one short of
    // the end so that argv[i + 1] is always a real argument: a trailing flag
    // without a value is ignored instead of dereferencing a null pointer.
    for (int i = 1; i + 1 < args; i++)
    {
        char* value = argv[i + 1];
        if (strcmp(argv[i], "-i") == 0)
        {
            a = value;
        }
        if (strcmp(argv[i], "-o") == 0)
        {
            b = value;
        }
        if (strcmp(argv[i], "-w") == 0)
        {
            w = atoi(value);
        }
        if (strcmp(argv[i], "-n") == 0)
        {
            n = atoi(value);
        }
        if (strcmp(argv[i], "-m") == 0)
        {
            m = atoi(value);
        }
    }

    // Both paths are required; without them the file routines would be
    // handed a null pointer.
    if (a == NULL || b == NULL)
    {
        fprintf(stderr,
                "usage: %s -i <input> -o <output> [-w <weight>] [-n <count>] [-m <length>]\n",
                args > 0 ? argv[0] : "program");
        return 1;
    }

    readtxt(a);   // read the file and count all characters
    divide_n(w);  // count lines and words, adjust the character count, tally and sort words
    if (m == 0)
    {
        writetxt_n(b, n);  // report format without -m
    }
    else
    {
        readtxt2(a);      // re-read the file without touching the character count
        divide_m(w, m);   // tally and sort phrases of m consecutive valid words
        writetxt_m(b);    // report format with -m
    }
    return 0;
}
2.词频统计(divide_n) (writetxt_n)

1474284-20181010184914331-530945665.png

void divide_n(int w)
{
    size_t length;
    string wordstring;
    char wordchar[999];
    int w2 = 1;
    int i, j, k = 0;
    int pos = 0;
    const char *sep = "./;'[] \\<>?:\"{}|`~!@#$%^&*()_+-=\n"; //需要分割的字符
    char *p;
    char *buf;
    p = strtok_s(s, sep, &buf);
    while (p)
    {
        wordstring = p;
        strcpy_s(wordchar, wordstring.c_str());
        if (strcmp(wordchar, "Title") == 0)
        {
            charnum -= 11;
            linenum++;
            w2 = w;
        } // 出现Title 权重为w;
        else if (strcmp(wordchar, "Abstract") == 0)
        {
            charnum -= 10;
            linenum++;
            w2 = 1;
        }// 出现Abstract 权重为1;
        else
        {
            length = wordstring.length();
            for (i = 0; i <= length; i++)
            {
                if (wordchar[i] >= 'A' && wordchar[i] <= 'Z')
                {
                    wordchar[i] = wordchar[i] + 32;
                }
            }
            wordstring = wordchar;
            if (wordstring.length() >= 4)
            {
                for (j = 0; j <= 3; j++)//判断该单词是否符合前四位为字母
                {
                    if (wordchar[j] >= 'a' && wordchar[j] <= 'z')
                        pos = 1;
                    else
                    {
                        pos = 0;
                        break;
                    }
                }
            }
            if (pos == 1)
            {
                wordnum++;
                if (w_c.count(wordstring) == 0)
                {
                    w_c.insert(make_pair(wordstring, w2));
                }
                else
                {
                    w_c[wordstring] += w2;
                }
                pos = 0;
            }
        }
        p = strtok_s(NULL, sep, &buf);
    }
    for (w_c_iter = w_c.begin(); w_c_iter != w_c.end(); w_c_iter++)
    {
        w_c2.push_back(make_pair(w_c_iter->first, w_c_iter->second));
    }
    sort(w_c2.begin(), w_c2.end(), Comp);
}
/*
 * Writes the word-frequency report to the file named by `b`.
 *
 * b: output file path; opened in text write mode ("w").
 * n: number of entries to list from the front of `w_c2`; 0 means 10.
 *    If `w_c2` has fewer entries (or n is negative), every entry is listed.
 *
 * Report layout:
 *   characters: <charnum + 2>
 *   words: <wordnum>
 *   lines: <linenum>
 *   <word>: <count>        (one line per listed entry)
 *
 * The "+ 2" offset on the character count is kept as in the original
 * (reason not visible here -- TODO confirm).
 * On a null path or a failed open, a message is printed to stderr and
 * nothing is written.
 */
void writetxt_n(char *b, int n)
{
    // 12 bytes hold any 32-bit int in base 10 ("-2147483648" + terminator).
    char charnum_s[12], wordnum_s[12], linenum_s[12];
    char num_s[12];
    string res;

    _itoa_s(charnum + 2, charnum_s, 10);
    _itoa_s(wordnum, wordnum_s, 10);
    _itoa_s(linenum, linenum_s, 10);
    res = res + "characters: " + charnum_s + "\n";
    res = res + "words: " + wordnum_s + "\n";
    res = res + "lines: " + linenum_s + "\n";

    if (n == 0)
    {
        n = 10;
    }

    // Number of entries to print: min(n, size); a negative n lists everything.
    size_t limit = w_c2.size();
    if (n > 0 && static_cast<size_t>(n) < limit)
    {
        limit = static_cast<size_t>(n);
    }
    for (size_t i = 0; i < limit; i++)
    {
        _itoa_s(w_c2[i].second, num_s, 10);
        res = res + "<" + w_c2[i].first + ">: " + num_s + "\n";
    }

    if (b == NULL)
    {
        fprintf(stderr, "writetxt_n: no output file given\n");
        return;
    }
    FILE *fp1 = NULL;
    errno_t err = fopen_s(&fp1, b, "w");
    if (err != 0 || fp1 == NULL)
    {
        fprintf(stderr, "writetxt_n: cannot open output file\n");
        return;
    }
    // Write straight from the string; no fixed-size intermediate copy.
    fwrite(res.c_str(), 1, res.length(), fp1);
    fclose(fp1);  // flush and release the handle
}
3.词组统计(divide_m) (writetxt_m)

1474284-20181010190301119-1498557526.png

void divide_m(int w, int m)
{
    size_t length;
    int cznum = 0;
    string cz;
    string wordstring;
    char wordchar[999];
    int w2 = 1;
    int i, j, k = 0;
    int pos = 0;
    const char *sep = "./;'[] \\<>?:\"{}|`~!@#$%^&*()_+-=\n"; //需要分割的字符
    char *p = NULL;
    char *buf;
    p = strtok_s(s, sep, &buf);
    while (p)
    {
        wordstring = p;
        strcpy_s(wordchar, wordstring.c_str());
        if (strcmp(wordchar, "Title") == 0)
        {
            w2 = w;
            while (cz_q1.empty() == 0)
            {
                cz_q1.pop();
            }
            while (cz_q2.empty() == 0)
            {
                cz_q2.pop();
            }
        } // 出现Title 权重为w;
        else if (strcmp(wordchar, "Abstract") == 0)
        {
            w2 = 1;
            while (cz_q1.empty() == 0)
            {
                cz_q1.pop();
            }
            while (cz_q2.empty() == 0)
            {
                cz_q2.pop();
            }
        }// 出现Abstract 权重为1;
        else
        {
            length = wordstring.length();
            for (i = 0; i <= length; i++)
            {
                if (wordchar[i] >= 'A' && wordchar[i] <= 'Z')
                {

                    wordchar[i] = wordchar[i] + 32;
                }
            }
            wordstring = wordchar;
            if (wordstring.length() >= 4) //合法pos=1  不合法pos=0
            {
                for (j = 0; j <= 3; j++)//判断该单词是否符合前四位为字母
                {
                    if (wordchar[j] >= 'a' && wordchar[j] <= 'z')
                    {
                        pos = 1;
                    }
                    else
                    {
                        pos = 0;
                        break;
                    }
                }
            }
            else
            {
                pos = 0;
            }

            if (pos == 1)
            {
                if (cz_q2.size() == 0)
                {
                    cz = "";
                }
                cz_q1.push(wordstring); //将合法单词入队q1
                cz_q2.push(wordstring); //将合法单词入队q2
                if (cz_q2.size() == m)
                {
                    cz_q1.pop();    //若满足条件称为词组 则q1的首个单词出队
                    for (i = 1; i <= m; i++)   //q2的所有单词存入cz中 用于输出并清空q2
                    {
                        if (i == m)
                        {
                            cz = cz + cz_q2.front();
                            cz_q2.pop();
                        }
                        else
                        {
                            cz = cz + cz_q2.front() + " ";
                            cz_q2.pop();
                        }
                    }
                    if (cz_c.count(cz) == 0) //查询map中是否有该词组 无则将 词组,频率 引入  有则将原有 词组的频率累加
                    {
                        cz_c.insert(make_pair(cz, w2));
                        cz = "";
                    }
                    else
                    {
                        cz_c[cz] += w2;
                        cz = "";
                    }
                    for (j = 1; j <= cz_q1.size(); j++)  //将q1中剩余单词存入pop[]中 同步存入q2
                    {
                        pop[j] = cz_q1.front();
                        cz_q1.pop();
                        cz_q1.push(pop[j]);
                        cz_q2.push(pop[j]);
                    }
                }
            }
            else if (pos == 0) //当遇到非法单词 将两个队列清空
            {
                while (cz_q1.empty() == 0)
                {
                    cz_q1.pop();
                }
                while (cz_q2.empty() == 0)
                {
                    cz_q2.pop();
                }
            }
        }
        p = strtok_s(NULL, sep, &buf);
    }
    for (cz_c_iter = cz_c.begin(); cz_c_iter != cz_c.end(); cz_c_iter++)
    {
        cz_c2.push_back(make_pair(cz_c_iter->first, cz_c_iter->second));
    }
    sort(cz_c2.begin(), cz_c2.end(), Comp);
}
/*
 * Writes the phrase-frequency report to the file named by `b`.
 *
 * b: output file path; opened in text write mode ("w").
 *
 * Report layout:
 *   characters: <charnum + 2>
 *   words: <wordnum>
 *   lines: <linenum>
 *   <phrase>: <count>      (at most the first 10 entries of cz_c2)
 *
 * This function takes no count parameter, so the listing is always capped
 * at 10 entries. The "+ 2" offset on the character count is kept as in the
 * original (reason not visible here -- TODO confirm).
 * On a null path or a failed open, a message is printed to stderr and
 * nothing is written.
 */
void writetxt_m(char *b)
{
    // 12 bytes hold any 32-bit int in base 10 ("-2147483648" + terminator).
    char charnum_s[12], wordnum_s[12], linenum_s[12];
    char num_s[12];
    string res;

    _itoa_s(charnum + 2, charnum_s, 10);
    _itoa_s(wordnum, wordnum_s, 10);
    _itoa_s(linenum, linenum_s, 10);
    res = res + "characters: " + charnum_s + "\n";
    res = res + "words: " + wordnum_s + "\n";
    res = res + "lines: " + linenum_s + "\n";

    // List min(10, size) entries from the front of the sorted sequence.
    size_t limit = cz_c2.size();
    if (limit > 10)
    {
        limit = 10;
    }
    for (size_t i = 0; i < limit; i++)
    {
        _itoa_s(cz_c2[i].second, num_s, 10);
        res = res + "<" + cz_c2[i].first + ">: " + num_s + "\n";
    }

    if (b == NULL)
    {
        fprintf(stderr, "writetxt_m: no output file given\n");
        return;
    }
    FILE *fp1 = NULL;
    errno_t err = fopen_s(&fp1, b, "w");
    if (err != 0 || fp1 == NULL)
    {
        fprintf(stderr, "writetxt_m: cannot open output file\n");
        return;
    }
    // Write straight from the string; no fixed-size intermediate copy.
    fwrite(res.c_str(), 1, res.length(), fp1);
    fclose(fp1);  // flush and release the handle
}

七、测试结果

  • 输入
    1474284-20181010195647857-1486271667.png
  • 输出
    1474284-20181010190640521-1227716977.png

八、性能分析

1474284-20181010192528251-486095212.png

1474284-20181010192531138-1379312612.png

九、附加功能

  • 代码
import json

from pyecharts import Bar, Line, Scatter, EffectScatter, Grid, WordCloud, Graph, Page
# Hot words paired with their counts, highest count first.
HOT_WORDS = [
    ("learning", 2879),
    ("with", 2744),
    ("image", 2306),
    ("from", 1826),
    ("network", 1757),
    ("that", 1757),
    ("deep", 1735),
    ("networks", 1510),
    ("this", 1423),
    ("video", 1088),
    ("visual", 1030),
    ("neural", 952),
    ("detection", 938),
    ("model", 909),
    ("segmentation", 889),
    ("multi", 827),
]

# The chart takes the words and their counts as two parallel lists.
name = [word for word, _ in HOT_WORDS]
value = [count for _, count in HOT_WORDS]

# Build the word cloud and render it; the output location is whatever
# pyecharts' render() uses by default.
wordcloud = WordCloud("CVPR热词图谱")
wordcloud.add("", name, value, word_size_range=[20, 100])
wordcloud.render()

1474284-20181010201613750-394610656.gif

十、评价队友

相当棒棒

1474284-20181010193839846-1678030756.png

十一、学习记录

第N周新增代码累计代码本周学习时间累计学习时间(小时)重要成长
120020055对Axure的学习
52004001217html,css的学习
5400800825对c中各种函数的学习

转载于:https://www.cnblogs.com/kkyblog/p/9766631.html

Logo

开放原子开发者工作坊旨在鼓励更多人参与开源活动,与志同道合的开发者们相互交流开发经验、分享开发心得、获取前沿技术趋势。工作坊有多种形式的开发者活动,如meetup、训练营等,主打技术交流,干货满满,真诚地邀请各位开发者共同参与!

更多推荐